[TUTORIAL]TFLite QNN Tutorial (#5595)

author Samuel <siju.samuel@huawei.com>

Thu, 21 May 2020 00:39:14 +0000 (06:09 +0530)

committer GitHub <noreply@github.com>

Thu, 21 May 2020 00:39:14 +0000 (09:39 +0900)
author Samuel <siju.samuel@huawei.com>
Thu, 21 May 2020 00:39:14 +0000 (06:09 +0530)
committer GitHub <noreply@github.com>
Thu, 21 May 2020 00:39:14 +0000 (09:39 +0900)
diff --git a/tutorials/frontend/deploy_prequantized_tflite.py b/tutorials/frontend/deploy_prequantized_tflite.py

new file mode 100644 (file)

index 0000000..f6c4544
--- /dev/null
+++ b/tutorials/frontend/deploy_prequantized_tflite.py
@@ -0,0 +1,251 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)
+================================================================
+**Author**: `Siju Samuel <https://github.com/siju-samuel>`_
+Welcome to part 3 of the Deploy Framework-Prequantized Model with TVM tutorial.
+In this part, we will start with a Quantized TFLite graph and then compile and execute it via TVM.
+
+
+For more details on quantizing the model using TFLite, readers are encouraged to
+go through `Converting Quantized Models
+<https://www.tensorflow.org/lite/convert/quantization>`_.
+
+The TFLite models can be downloaded from this `link
+<https://www.tensorflow.org/lite/guide/hosted_models>`_.
+
+To get started, the TensorFlow and TFLite packages need to be installed as prerequisites.
+
+.. code-block:: bash
+
+    # install tensorflow and tflite
+    pip install tensorflow==2.1.0
+    pip install tflite==2.1.0
+
+Now please check that the TFLite package is installed successfully: ``python -c "import tflite"``
+
+"""
+
+###############################################################################
+# Necessary imports
+# -----------------
+import os
+
+import numpy as np
+import tflite
+
+import tvm
+from tvm import relay
+
+
+######################################################################
+# Download pretrained Quantized TFLite model
+# ------------------------------------------
+
+# Download mobilenet V2 TFLite model provided by Google
+from tvm.contrib.download import download_testdata
+
+model_url = "https://storage.googleapis.com/download.tensorflow.org/models/" \
+             "tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz"
+
+# Download the model tar file; extracting it (see below) yields mobilenet_v2_1.0_224_quant.tflite
+model_path = download_testdata(model_url, "mobilenet_v2_1.0_224_quant.tgz",
+                               module=['tf', 'official'])
+model_dir = os.path.dirname(model_path)
+
+
+######################################################################
+# Utility for extracting the downloaded tar archive
+# -------------------------------------------------
+def extract(path):
+    import tarfile
+    if path.endswith("tgz") or path.endswith("gz"):
+        dir_path = os.path.dirname(path)
+        tar = tarfile.open(path)
+        tar.extractall(path=dir_path)
+        tar.close()
+    else:
+        raise RuntimeError('Could not decompress the file: ' + path)
+
+extract(model_path)
+
+
+######################################################################
+# Load a test image
+# -----------------
+
+#######################################################################
+# Get a real image for e2e testing
+# --------------------------------
+def get_real_image(im_height, im_width):
+    from PIL import Image
+    repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/'
+    img_name = 'elephant-299.jpg'
+    image_url = os.path.join(repo_base, img_name)
+    img_path = download_testdata(image_url, img_name, module='data')
+    image = Image.open(img_path).resize((im_height, im_width))
+    x = np.array(image).astype('uint8')
+    data = np.reshape(x, (1, im_height, im_width, 3))
+    return data
+
+data = get_real_image(224, 224)
+
+######################################################################
+# Load a tflite model
+# -------------------
+
+######################################################################
+# Now we can open mobilenet_v2_1.0_224_quant.tflite
+tflite_model_file = os.path.join(model_dir, "mobilenet_v2_1.0_224_quant.tflite")
+tflite_model_buf = open(tflite_model_file, "rb").read()
+
+tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+
+
+###############################################################################
+# Let's run TFLite pre-quantized model inference and get the TFLite prediction.
+def run_tflite_model(tflite_model_buf, input_data):
+    """ Generic function to execute TFLite """
+    try:
+        from tensorflow import lite as interpreter_wrapper
+    except ImportError:
+        from tensorflow.contrib import lite as interpreter_wrapper
+
+    input_data = input_data if isinstance(input_data, list) else [input_data]
+
+    interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    output_details = interpreter.get_output_details()
+
+    # set input
+    assert len(input_data) == len(input_details)
+    for i in range(len(input_details)):
+        interpreter.set_tensor(input_details[i]['index'], input_data[i])
+
+    # Run
+    interpreter.invoke()
+
+    # get output
+    tflite_output = list()
+    for i in range(len(output_details)):
+        tflite_output.append(interpreter.get_tensor(output_details[i]['index']))
+
+    return tflite_output
+
+###############################################################################
+# Let's run TVM compiled pre-quantized model inference and get the TVM prediction.
+def run_tvm(graph, lib, params):
+    from tvm.contrib import graph_runtime
+    rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    rt_mod.set_input(**params)
+    rt_mod.set_input('input', data)
+    rt_mod.run()
+    tvm_res = rt_mod.get_output(0).asnumpy()
+    tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1]
+    return tvm_pred, rt_mod
+
+
+###############################################################################
+# TFLite inference
+# ----------------
+
+###############################################################################
+# Run TFLite inference on the quantized model.
+tflite_res = run_tflite_model(tflite_model_buf, data)
+tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1]
+
+###############################################################################
+# TVM compilation and inference
+# -----------------------------
+
+###############################################################################
+# We use the TFLite-Relay parser to convert the TFLite pre-quantized graph into Relay IR. Note that
+# the frontend parser call for a pre-quantized model is exactly the same as the call for an FP32
+# model. We encourage you to uncomment print(mod) and inspect the Relay module. You will see many
+# QNN operators, such as Requantize, Quantize and QNN Conv2D.
+dtype_dict = {'input': data.dtype.name}
+shape_dict = {'input': data.shape}
+
+mod, params = relay.frontend.from_tflite(tflite_model,
+                                         shape_dict=shape_dict,
+                                         dtype_dict=dtype_dict)
+# print(mod)
+
+###############################################################################
+# Let's now compile the Relay module. We use the "llvm" target here. Please replace it with the
+# target platform that you are interested in.
+target = 'llvm'
+with relay.build_config(opt_level=3):
+    graph, lib, params = relay.build_module.build(mod, target=target,
+                                                  params=params)
+
+###############################################################################
+# Finally, let's run inference on the TVM compiled module.
+tvm_pred, rt_mod = run_tvm(graph, lib, params)
+
+###############################################################################
+# Accuracy comparison
+# -------------------
+
+###############################################################################
+# Print the top-5 labels for TFLite and TVM inference.
+# We compare labels because the requantize implementation differs between TFLite and Relay,
+# which causes the final output numbers to mismatch. So, we test accuracy via labels.
+
+print("TVM Top-5 labels:", tvm_pred)
+print("TFLite Top-5 labels:", tflite_pred)
+
+
+##########################################################################
+# Measure performance
+# -------------------
+# Here we give an example of how to measure performance of TVM compiled models.
+n_repeat = 100  # should be bigger to make the measurement more accurate
+ctx = tvm.cpu(0)
+ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=n_repeat)
+prof_res = np.array(ftimer().results) * 1e3
+print("Elapsed average ms:", np.mean(prof_res))
+
+######################################################################
+# .. note::
+#
+#   Unless the hardware has special support for fast 8 bit instructions, quantized models are
+#   not expected to be any faster than FP32 models. Without fast 8 bit instructions, TVM does
+#   quantized convolution in 16 bit, even if the model itself is 8 bit.
+#
+#   For x86, the best performance can be achieved on CPUs with the AVX512 instruction set.
+#   In this case, TVM utilizes the fastest available 8 bit instructions for the given target.
+#   This includes support for the VNNI 8 bit dot product instruction (CascadeLake or newer).
+#   On an EC2 C5.12xlarge instance, TVM latency for this tutorial is ~2 ms.
+#
+#   Intel conv2d NCHWc schedule on ARM gives better end-to-end latency compared to ARM NCHW
+#   conv2d spatial pack schedule for many TFLite networks. ARM winograd performance is higher but
+#   it has a high memory footprint.
+#
+#   Moreover, the following general tips for CPU performance apply equally:
+#
+#    * Set the environment variable TVM_NUM_THREADS to the number of physical cores
+#    * Choose the best target for your hardware, such as "llvm -mcpu=skylake-avx512" or
+#      "llvm -mcpu=cascadelake" (more CPUs with AVX512 would come in the future)
+#    * Perform autotuning - `Auto-tuning a convolution network for x86 CPU
+#      <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_x86.html>`_.
+#    * To get best inference performance on ARM CPU, change target argument according to your
+#      device and follow `Auto-tuning a convolution network for ARM CPU
+#      <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_arm.html>`_.
author	Samuel <siju.samuel@huawei.com>
	Thu, 21 May 2020 00:39:14 +0000 (06:09 +0530)
committer	GitHub <noreply@github.com>
	Thu, 21 May 2020 00:39:14 +0000 (09:39 +0900)