.. rst-class:: sphx-glr-example-title

.. _sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py:

Auto-tuning an ALU fused op on VTA
----------------------------------

.. code-block:: default

    import os
    from mxnet.gluon.model_zoo import vision
    import numpy as np
    from PIL import Image

    from tvm import topi
    import tvm
    from tvm import te
    from tvm import rpc, autotvm, relay
    from tvm.contrib import download
    from tvm.autotvm.measure.measure_methods import request_remote
    from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
    from tvm.autotvm import record

    import vta
    from vta.testing import simulator
    from vta.top import graph_pack
    import copy

Compile network
---------------

Perform VTA-specific compilation with Relay from a Gluon model.

.. code-block:: default

    def compile_network(env, target, model, start_pack, stop_pack):

        # Populate the shape and data type dictionary
        dtype_dict = {"data": "float32"}
        shape_dict = {"data": (env.BATCH, 3, 224, 224)}

        # Get off-the-shelf gluon model and convert to Relay
        gluon_model = vision.get_model(model, pretrained=True)
        mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        # Perform quantization in Relay
        # Note: We set opt_level to 3 in order to fold batch norm
        with relay.build_config(opt_level=3):
            with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
                mod = relay.quantize.quantize(mod, params=params)

        # Perform graph packing and constant folding for VTA target
        if target.device_name == "vta":
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(
                mod["main"],
                env.BATCH,
                env.BLOCK_OUT,
                env.WGT_WIDTH,
                start_name=start_pack,
                stop_name=stop_pack,
            )
        else:
            # On a CPU-only target no packing is needed
            relay_prog = mod["main"]

        return relay_prog, params

Set Tuning Options
------------------

Before tuning, we should apply some configurations. Here we use a Pynq-Z1 board as an example.

.. code-block:: default

    # Tracker host and port can be set by your environment
    tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
    tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))

    # Load VTA parameters from the vta/config/vta_config.json file
    env = vta.get_env()

    # This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
    # Set ``device=arm_cpu`` to run inference on the CPU
    # or ``device=vta`` to run inference on the FPGA.
    device = "vta"
    target = env.target if device == "vta" else env.target_vta_cpu

    # Name of Gluon model to compile
    # The ``start_pack`` and ``stop_pack`` labels indicate where
    # to start and end the graph packing relay pass: in other words
    # where to start and finish offloading to VTA.
    network = "resnet50_v2"
    start_pack = "nn.max_pool2d"
    stop_pack = "nn.global_avg_pool2d"

    # Tuning option
    log_file = "%s.alu.%s.log" % (device, network)
    tuning_option = {
        "log_filename": log_file,
        "tuner": "random",
        "n_trial": 1000,
        "early_stopping": None,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(n_parallel=1),
            runner=autotvm.RPCRunner(
                env.TARGET,
                host=tracker_host,
                port=tracker_port,
                number=5,
                timeout=60,
                # check_correctness=True,  # TODO: re-enable when check_correctness works again.
            ),
        ),
    }
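
    # Optional sanity check (an added sketch, not part of the original script):
    # ``autotvm.RPCRunner`` needs an RPC tracker listening on
    # ``tracker_host:tracker_port`` with a board registered under ``env.TARGET``.
    # Requesting a remote session once fails fast if nothing is registered;
    # simulator targets do not use the tracker, so they are skipped here.
    if env.TARGET not in ("sim", "tsim"):
        remote = request_remote(env.TARGET, tracker_host, tracker_port, timeout=30)
        print("Obtained a remote session for %s" % env.TARGET)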

    def log_to_file(file_out, protocol="json"):
        """Log the tuning records into file.

        The rows of the log are stored in the format of autotvm.record.encode.
        For lhs == rhs, we add an extra rhs = [] record.

        Parameters
        ----------
        file_out : str
            The file to log to.
        protocol : str, optional
            The log protocol. Can be 'json' or 'pickle'.

        Returns
        -------
        callback : callable
            Callback function to do the logging.
        """

        def _callback(_, inputs, results):
            with open(file_out, "a") as f:
                for inp, result in zip(inputs, results):
                    f.write(record.encode(inp, result, protocol) + "\n")

                    # we only consider tasks with the same lhs and rhs
                    if inp.task.args[0] == inp.task.args[1]:
                        args = list(inp.task.args)
                        args[1] = (args[0][0], (), args[0][2])
                        inp_copy = copy.deepcopy(inp)
                        inp_copy.task.args = tuple(args)
                        f.write(record.encode(inp_copy, result, protocol) + "\n")

        return _callback


    def tune_tasks(
        tasks,
        measure_option,
        tuner="xgb",
        n_trial=10,
        early_stopping=None,
        log_filename="tuning.log",
        use_transfer_learning=True,
    ):
        # create tmp log file
        tmp_log_file = log_filename + ".tmp"
        if os.path.exists(tmp_log_file):
            os.remove(tmp_log_file)

        for i, tsk in enumerate(reversed(tasks)):
            prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

            # create tuner
            if tuner == "xgb" or tuner == "xgb-rank":
                tuner_obj = XGBTuner(tsk, loss_type="rank")
            elif tuner == "xgb_knob":
                tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
            elif tuner == "ga":
                tuner_obj = GATuner(tsk, pop_size=50)
            elif tuner == "random":
                tuner_obj = RandomTuner(tsk)
            elif tuner == "gridsearch":
                tuner_obj = GridSearchTuner(tsk)
            else:
                raise ValueError("Invalid tuner: " + tuner)

            if use_transfer_learning:
                if os.path.isfile(tmp_log_file):
                    tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

            # do tuning
            tsk_trial = min(n_trial, len(tsk.config_space))
            tuner_obj.tune(
                n_trial=tsk_trial,
                early_stopping=early_stopping,
                measure_option=measure_option,
                callbacks=[
                    autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
                    log_to_file(tmp_log_file),
                ],
            )

        # pick best records to a cache file
        autotvm.record.pick_best(tmp_log_file, log_filename)
        os.remove(tmp_log_file)

Register VTA-specific tuning tasks.

.. code-block:: default

    def register_vta_tuning_tasks():
        from tvm.autotvm.task import TaskExtractEnv

        @tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
        def my_clip(x, a_min, a_max):
            """Unlike topi's current clip, put min and max into two stages."""
            const_min = tvm.tir.const(a_min, x.dtype)
            const_max = tvm.tir.const(a_max, x.dtype)
            x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
            x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
            return x

        # init autotvm env to register VTA operator
        TaskExtractEnv()

        @autotvm.template("add.vta")
        def _topi_add(*args, **kwargs):
            assert not kwargs, "Do not support kwargs in template function call"
            A, B = args[:2]

            with tvm.target.vta():
                res = vta.top.op.add_packed(*args, **kwargs)
                res = my_clip(res, 0, 127)
                res = topi.cast(res, "int8")

            if tvm.target.Target.current().device_name == "vta":
                s = vta.top.op.schedule_add_packed([res])
            else:
                s = te.create_schedule([res.op])
            return s, [A, B, res]

        @autotvm.template("multiply.vta")
        def _topi_multiply(*args, **kwargs):
            assert not kwargs, "Do not support kwargs in template function call"
            A, B = args[:2]

            with tvm.target.vta():
                res = vta.top.op.multiply_packed(*args, **kwargs)
                res = my_clip(res, 0, 127)
                res = topi.cast(res, "int8")

            if tvm.target.Target.current().device_name == "vta":
                s = vta.top.op.schedule_multiply_packed([res])
            else:
                s = te.create_schedule([res.op])
            return s, [A, B, res]

Finally, we launch the tuning jobs and log the best schedules found.
.. code-block:: default

    def tune_and_evaluate(tuning_opt):

        if env.TARGET != "intelfocl":
            print("ALU only op only available for intelfocl target")
            return

        # Register VTA tuning tasks
        register_vta_tuning_tasks()

        # Perform task extraction on Relay program
        print("Extract tasks...")
        relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
        mod = tvm.IRModule.from_expr(relay_prog)
        tasks = autotvm.task.extract_from_program(
            mod,
            params=params,
            ops=(
                relay.op.get("add"),
                relay.op.get("multiply"),
            ),
            target=target,
            target_host=env.target_host,
        )

        # filter out non-packed ALU tasks
        tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))
        # filter out float ALU tasks
        tasks = list(filter(lambda t: t.args[0][2] != "float32", tasks))

        # Deduplicate the extracted ALU tasks: a task whose rhs has an empty
        # shape is treated as if rhs == lhs before merging
        tasks_set = {}
        print("Extracted {} alu tasks:".format(len(tasks)))
        for tsk in tasks:
            print("tsk = ", tsk)

            if len(tsk.args[1][1]) == 0:
                args = list(tsk.args)
                args[1] = args[0]
                tsk.args = tuple(args)

            if (tsk.name, tsk.args) in tasks_set:
                print("task {} already exists".format(tsk))
            tasks_set[(tsk.name, tsk.args)] = tsk

        tasks = list(tasks_set.values())
        print("After merged, final #tasks={}, tasks = {}".format(len(tasks), tasks))

        # run tuning tasks
        print("Tuning...")
        tune_tasks(tasks, **tuning_opt)


    # Run the tuning
    tune_and_evaluate(tuning_option)

.. rst-class:: sphx-glr-script-out

Out:

.. code-block:: none

    ALU only op only available for intelfocl target
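
Once tuning has completed on an ``intelfocl`` target, the best records stored in ``log_file`` can be applied when compiling the network for deployment. The snippet below is a minimal sketch that is not executed by this tutorial: it assumes ``relay_prog`` and ``params`` were produced by ``compile_network`` as above, and simply wraps the usual VTA build in ``autotvm.apply_history_best`` so that the tuned ALU schedules are picked up.

.. code-block:: python

    # Minimal sketch (assumes tuning has finished and ``relay_prog``/``params``
    # come from ``compile_network``): build with the tuned records applied.
    with autotvm.apply_history_best(log_file):
        with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            lib = relay.build(
                relay_prog, target=target, params=params, target_host=env.target_host
            )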