--- /dev/null
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Deploy a Quantized Model on CUDA
+================================
+**Author**: `Wuwei Lin <https://github.com/vinx13>`_
+
+This article is an introductory tutorial on automatic quantization with TVM.
+Automatic quantization is one of the quantization modes in TVM. More details on
+the quantization story in TVM can be found
+`here <https://discuss.tvm.ai/t/quantization-story/3920>`_.
+In this tutorial, we will import a GluonCV model pre-trained on ImageNet into
+Relay, quantize the Relay model, and then perform inference.
+"""
+
+import mxnet as mx
+from mxnet import gluon
+
+import tvm
+from tvm import relay
+from tvm.contrib.download import download_testdata
+
+batch_size = 1
+model_name = "resnet18_v1"
+target = 'cuda'
+ctx = tvm.context(target)
+
+###############################################################################
+# Prepare the Dataset
+# -------------------
+# We will demonstrate how to prepare the calibration dataset for quantization.
+# We first download the validation set of ImageNet and pre-process the dataset.
+calibration_rec = download_testdata(
+ 'http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec',
+ 'val_256_q90.rec')
+
+def get_val_data(num_workers=4):
+ mean_rgb = [123.68, 116.779, 103.939]
+ std_rgb = [58.393, 57.12, 57.375]
+
+ def batch_fn(batch):
+ return batch.data[0].asnumpy(), batch.label[0].asnumpy()
+
+ img_size = 299 if model_name == 'inceptionv3' else 224
+ val_data = mx.io.ImageRecordIter(
+ path_imgrec=calibration_rec,
+ preprocess_threads=num_workers,
+ shuffle=False,
+ batch_size=batch_size,
+ resize=256,
+ data_shape=(3, img_size, img_size),
+ mean_r=mean_rgb[0],
+ mean_g=mean_rgb[1],
+ mean_b=mean_rgb[2],
+ std_r=std_rgb[0],
+ std_g=std_rgb[1],
+ std_b=std_rgb[2],
+ )
+ return val_data, batch_fn
+
+
+###############################################################################
+# The calibration dataset should be an iterable object. Here we define it as
+# a Python generator. In this tutorial, we only use a few samples for
+# calibration.
+
+calibration_samples = 10
+
+def calibrate_dataset():
+ val_data, batch_fn = get_val_data()
+ val_data.reset()
+ for i, batch in enumerate(val_data):
+ if i * batch_size >= calibration_samples:
+ break
+ data, _ = batch_fn(batch)
+ yield {'data': data}
+
+
+###############################################################################
+# Import the model
+# ----------------
+# We use the Relay MXNet frontend to import a model from the Gluon model zoo.
+def get_model():
+ gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
+ img_size = 299 if model_name == 'inceptionv3' else 224
+ data_shape = (batch_size, 3, img_size, img_size)
+ mod, params = relay.frontend.from_mxnet(gluon_model, {"data": data_shape})
+ return mod, params
+
+
+###############################################################################
+# Quantize the Model
+# ------------------
+# In quantization, we need to find the scale for each weight and intermediate
+# feature map tensor of each layer.
+#
+# For weights, the scales are calculated directly from the values of the
+# weights. Two modes are supported: `power2` and `max`. Both modes first find
+# the maximum absolute value within the weight tensor. In `power2` mode, the
+# maximum is rounded down to a power of two. If the scales of both weights and
+# intermediate feature maps are powers of two, we can leverage bit shifting
+# for multiplications, which makes the computation more efficient. In `max`
+# mode, the maximum is used as the scale directly. Without rounding, `max`
+# mode might have better accuracy in some cases. When the scales are not
+# powers of two, fixed-point multiplications are used.
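+#
+# Below is a minimal standalone sketch of the two modes on a toy weight
+# tensor. It uses plain NumPy for illustration and is not TVM's internal
+# implementation.
+
+import numpy as np
+
+toy_weights = np.array([0.02, -0.75, 0.31, -1.9], dtype=np.float32)
+max_abs = float(np.abs(toy_weights).max())            # 1.9
+
+# `max` mode: the maximum absolute value becomes the scale directly.
+toy_scale_max = max_abs                               # 1.9
+
+# `power2` mode: the maximum is rounded down to the nearest power of two, so
+# a multiplication by the scale can be implemented as a bit shift.
+toy_scale_power2 = 2.0 ** np.floor(np.log2(max_abs))  # 1.0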
+
+###############################################################################
+# For intermediate feature maps, we can find the scales with data-aware
+# quantization. Data-aware quantization takes a calibration dataset as an
+# input argument. The scales are calculated by minimizing the KL divergence
+# between the distributions of the activations before and after quantization;
+# a simplified sketch of this search follows the helper below.
+# Alternatively, we can use pre-defined global scales. This saves calibration
+# time, but the accuracy might be impacted.
+
+def quantize(mod, params, data_aware):
+ if data_aware:
+ with relay.quantize.qconfig(calibrate_mode='kl_divergence', weight_scale='max'):
+ mod = relay.quantize.quantize(mod, params, dataset=calibrate_dataset())
+ else:
+ with relay.quantize.qconfig(calibrate_mode='global', global_scale=8.0):
+ mod = relay.quantize.quantize(mod, params)
+ return mod
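+
+###############################################################################
+# To build intuition for the `kl_divergence` calibrate mode, the following
+# standalone sketch searches for a clipping threshold that minimizes the KL
+# divergence between a reference activation histogram and a coarsely
+# quantized version of it. This is a deliberately simplified illustration of
+# the idea, not TVM's actual calibration code; the toy activations stand in
+# for values collected from a real calibration run.
+
+import numpy as np
+
+def _kl_divergence(p, q):
+    # KL divergence between two unnormalized, strictly positive histograms.
+    p = p / p.sum()
+    q = q / q.sum()
+    return float(np.sum(p * np.log(p / q)))
+
+def _quantize_distribution(hist, num_levels=256):
+    # Collapse a fine histogram into `num_levels` buckets and expand it back,
+    # mimicking the information loss of 8-bit quantization.
+    chunks = np.array_split(hist.astype(np.float64), num_levels)
+    return np.concatenate([np.full(len(c), c.sum() / len(c)) for c in chunks])
+
+np.random.seed(0)
+toy_acts = np.abs(np.random.randn(10000).astype(np.float32))
+hist, edges = np.histogram(toy_acts, bins=2048)
+
+best_i, best_kl = None, float('inf')
+for i in range(256, 2048 + 1, 64):
+    # Reference distribution: the first i bins, with the clipped tail mass
+    # folded into the last kept bin.
+    p = hist[:i].astype(np.float64) + 1e-6
+    p[-1] += hist[i:].sum()
+    # Candidate distribution: the same bins represented with only 256 levels.
+    q = _quantize_distribution(hist[:i]) + 1e-6
+    kl = _kl_divergence(p, q)
+    if kl < best_kl:
+        best_i, best_kl = i, kl
+
+threshold = edges[best_i]  # the chosen clipping threshold for the activations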
+
+
+###############################################################################
+# Run Inference
+# -------------
+# We create a Relay VM to build and execute the model.
+def run_inference(mod):
+    # Build the model once, then reuse the compiled function for every batch.
+    model = relay.create_executor('vm', mod, ctx, target).evaluate()
+    val_data, batch_fn = get_val_data()
+    for i, batch in enumerate(val_data):
+        data, label = batch_fn(batch)
+        prediction = model(data)
+        if i > 10:  # only run inference on a few samples in this tutorial
+            break
+
+def main():
+ mod, params = get_model()
+ mod = quantize(mod, params, data_aware=True)
+ run_inference(mod)
+
+if __name__ == '__main__':
+ main()