Compile PyTorch Object Detection Models

This article is an introductory tutorial on deploying PyTorch object detection models with the Relay VM.

To begin, PyTorch must be installed. TorchVision is also required, since we will use it as our model zoo.

A quick solution is to install both via pip:

pip install torch
pip install torchvision

Alternatively, refer to the official site: https://pytorch.org/get-started/locally/

PyTorch versions should be backwards compatible, but each must be used with the matching TorchVision version.

Currently, TVM supports PyTorch 1.7 and 1.4. Other versions may be unstable.

import tvm
from tvm import relay
from tvm import relay
from tvm.runtime.vm import VirtualMachine
from tvm.contrib.download import download_testdata

import numpy as np
import cv2

# PyTorch imports
import torch
import torchvision

Load pre-trained maskrcnn from torchvision and do tracing

# Side length, in pixels, of the square image used for tracing and inference.
in_size = 300

# Input shape for a single 3-channel image of in_size x in_size, laid out as
# (batch, channels, height, width).
input_shape = (1, 3, in_size, in_size)


def do_trace(model, inp):
    """Trace ``model`` with the example input ``inp`` and return the traced module.

    The traced module is switched to evaluation mode before it is returned.
    """
    # ``eval()`` returns the module it was called on, so the calls can be chained.
    return torch.jit.trace(model, inp).eval()


def dict_to_tuple(out_dict):
    """Flatten a detection-result dict into a tuple.

    Returns ``(boxes, scores, labels)``, with ``masks`` appended as a fourth
    element when the dict contains a ``"masks"`` entry.
    """
    wanted = ["boxes", "scores", "labels"]
    if "masks" in out_dict:
        wanted.append("masks")
    return tuple(out_dict[key] for key in wanted)


class TraceWrapper(torch.nn.Module):
    """Wrap a model so that ``forward`` returns a plain tuple.

    The wrapped model's output is indexed with ``[0]`` and that entry is
    converted with ``dict_to_tuple``.
    """

    def __init__(self, model):
        """Store ``model`` as a submodule."""
        super().__init__()
        self.model = model

    def forward(self, inp):
        """Run the wrapped model on ``inp`` and return its first result as a tuple."""
        results = self.model(inp)
        # Only the first entry of the model output is used.
        first_result = results[0]
        return dict_to_tuple(first_result)


# Build the pre-trained Mask R-CNN and wrap it so its output is a plain tuple.
# NOTE: newer torchvision releases warn that `pretrained=True` is deprecated
# and suggest the `weights=` argument instead.
model_func = torchvision.models.detection.maskrcnn_resnet50_fpn
detector = model_func(pretrained=True)
model = TraceWrapper(detector)
model.eval()

# Random example input with values drawn uniformly from [0, 250).
inp = torch.Tensor(np.random.uniform(0.0, 250.0, size=input_shape))

# Gradients are not needed for the reference run or for tracing.
with torch.no_grad():
    out = model(inp)
    script_module = do_trace(model, inp)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.
  warnings.warn(
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights.
  warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth

  0%|          | 0.00/170M [00:00<?, ?B/s]
  3%|2         | 4.50M/170M [00:00<00:03, 46.3MB/s]
  6%|5         | 9.80M/170M [00:00<00:03, 50.8MB/s]
  9%|8         | 14.6M/170M [00:00<00:04, 36.2MB/s]
 11%|#         | 18.5M/170M [00:00<00:04, 36.8MB/s]
 13%|#3        | 22.2M/170M [00:00<00:04, 36.3MB/s]
 15%|#5        | 25.8M/170M [00:00<00:04, 33.0MB/s]
 18%|#8        | 30.6M/170M [00:00<00:03, 37.9MB/s]
 21%|##        | 35.0M/170M [00:00<00:03, 40.2MB/s]
 23%|##3       | 39.7M/170M [00:01<00:03, 42.7MB/s]
 26%|##5       | 44.0M/170M [00:01<00:03, 43.3MB/s]
 28%|##8       | 48.2M/170M [00:01<00:02, 43.4MB/s]
 31%|###1      | 52.7M/170M [00:01<00:02, 44.6MB/s]
 34%|###3      | 57.0M/170M [00:01<00:02, 44.0MB/s]
 36%|###6      | 61.8M/170M [00:01<00:02, 45.5MB/s]
 39%|###8      | 66.1M/170M [00:01<00:02, 39.7MB/s]
 41%|####1     | 70.1M/170M [00:01<00:03, 33.1MB/s]
 43%|####3     | 73.5M/170M [00:01<00:03, 33.0MB/s]
 46%|####6     | 78.6M/170M [00:02<00:02, 38.3MB/s]
 49%|####8     | 82.5M/170M [00:02<00:02, 39.0MB/s]
 52%|#####1    | 87.7M/170M [00:02<00:01, 43.2MB/s]
 55%|#####5    | 93.7M/170M [00:02<00:01, 48.5MB/s]
 58%|#####8    | 98.9M/170M [00:02<00:01, 50.1MB/s]
 61%|######1   | 104M/170M [00:02<00:01, 45.1MB/s]
 64%|######3   | 108M/170M [00:02<00:01, 35.6MB/s]
 66%|######6   | 112M/170M [00:02<00:01, 37.8MB/s]
 69%|######9   | 118M/170M [00:03<00:01, 41.8MB/s]
 72%|#######2  | 122M/170M [00:03<00:01, 44.0MB/s]
 75%|#######4  | 127M/170M [00:03<00:01, 42.3MB/s]
 77%|#######7  | 131M/170M [00:03<00:00, 41.3MB/s]
 80%|#######9  | 135M/170M [00:03<00:00, 41.3MB/s]
 82%|########2 | 140M/170M [00:03<00:00, 42.5MB/s]
 85%|########4 | 144M/170M [00:03<00:00, 42.1MB/s]
 87%|########7 | 148M/170M [00:03<00:00, 42.7MB/s]
 90%|########9 | 152M/170M [00:03<00:00, 43.8MB/s]
 92%|#########2| 157M/170M [00:04<00:00, 39.9MB/s]
 95%|#########4| 161M/170M [00:04<00:00, 39.2MB/s]
 98%|#########7| 166M/170M [00:04<00:00, 43.2MB/s]
100%|##########| 170M/170M [00:04<00:00, 40.5MB/s]
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/generalized_rcnn.py:75: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results).
  for img in images:
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/transform.py:110: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results).
  images = [img for img in images]
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/transform.py:155: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/transform.py:156: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torch/nn/functional.py:3912: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  (torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torch/nn/functional.py:3912: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  (torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/_utils.py:176: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if box_sum > 0:
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/_utils.py:216: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/_utils.py:217: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/_utils.py:179: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if box_sum > 0:
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/_utils.py:519: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  min_kval = torch.min(torch.cat((torch.tensor([orig_kval], dtype=axis_dim_val.dtype), axis_dim_val), 0))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/rpn.py:275: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results).
  for boxes, scores, lvl, img_shape in zip(proposals, objectness_prob, levels, image_shapes):
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/boxes.py:156: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/boxes.py:157: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/boxes.py:157: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/boxes.py:158: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/boxes.py:159: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/boxes.py:159: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/boxes.py:72: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if boxes.numel() > (4000 if boxes.device.type == "cpu" else 20000) and not torchvision._is_tracing():
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/poolers.py:82: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  target_lvls = torch.floor(self.lvl0 + torch.log2(s / self.s0) + torch.tensor(self.eps, dtype=s.dtype))
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/ops/poolers.py:185: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results.
  num_rois = len(rois)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torch/__init__.py:1209: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  assert condition, message
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/transform.py:298: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  torch.tensor(s, dtype=torch.float32, device=boxes.device)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/transform.py:298: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  torch.tensor(s, dtype=torch.float32, device=boxes.device)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/transform.py:299: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/transform.py:299: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/roi_heads.py:389: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)
/venv/apache-tvm-py3.8/lib/python3.8/site-packages/torchvision/models/detection/roi_heads.py:389: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)

Download a test image and pre-process

img_url = (
    "https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/detection/street_small.jpg"
)
img_path = download_testdata(img_url, "test_street_small.jpg", module="data")

# Read the image as float32, resize it to in_size x in_size, and convert
# OpenCV's BGR channel order to RGB.
img = cv2.imread(img_path).astype("float32")
img = cv2.cvtColor(cv2.resize(img, (in_size, in_size)), cv2.COLOR_BGR2RGB)
# Divide by 255, move the channel axis to the front (HWC -> CHW), and add a
# leading batch axis.
img = (img / 255.0).transpose(2, 0, 1)[np.newaxis, ...]

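Import the graph to Relay

input_name = "input0"
shape_list = [(input_name, input_shape)]
mod, params = relay.frontend.from_pytorch(script_module, shape_list)
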
/workspace/python/tvm/relay/build_module.py:345: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
  warnings.warn(

Compile with Relay VM

Note: Currently only the CPU target is supported. For x86 targets, it is highly recommended to build TVM with Intel MKL and Intel OpenMP to get the best performance, because torchvision R-CNN models contain large dense operators.

# Append "-libs=mkl" to the target to get the best performance on x86.
# For an x86 machine that supports AVX512, the complete target is
# "llvm -mcpu=skylake-avx512 -libs=mkl".
target = "llvm"

# Compile at optimization level 3 with the FoldScaleAxis pass disabled.
build_ctx = tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"])
with build_ctx:
    vm_exec = relay.vm.compile(mod, target=target, params=params)

Inference with Relay VM

# Create a VM on the CPU device from the compiled executable.
dev = tvm.cpu()
vm = VirtualMachine(vm_exec, dev)

# The input is passed as a keyword argument whose name is held in `input_name`.
named_inputs = {input_name: img}
vm.set_input("main", **named_inputs)
tvm_res = vm.run()

Get boxes with score larger than 0.9

score_threshold = 0.9
boxes = tvm_res[0].numpy().tolist()
scores = tvm_res[1].numpy().tolist()

# Collect boxes until the first score that does not exceed the threshold;
# nothing after that score is examined.
valid_boxes = []
for box, score in zip(boxes, scores):
    if not score > score_threshold:
        break
    valid_boxes.append(box)

print("Get {} valid boxes".format(len(valid_boxes)))
Get 9 valid boxes

Total running time of the script: ( 3 minutes 47.805 seconds)

Gallery generated by Sphinx-Gallery