Adding more performance metrics
author alered01 <Alex.Redshaw@arm.com>
Thu, 7 May 2020 13:58:29 +0000 (14:58 +0100)
committer Alex Redshaw <Alex.Redshaw@arm.com>
Fri, 22 May 2020 11:05:07 +0000 (11:05 +0000)
* Implemented CLTuning flow for ExecuteNetwork tests
  * Added --tuning-path to specify the tuning file to use/create
  * Added --tuning-level to specify the tuning level to use and to enable an extra tuning run that generates/updates the tuning file
* Fixed issue where TuningLevel was being parsed incorrectly
* Added measurements for initialization, network parsing, network optimization, tuning, and shutdown
* Added --iterations flag to control the number of times inference is run

Signed-off-by: alered01 <Alex.Redshaw@arm.com>
Change-Id: Ic739ff26e136e32aff9f0995217c1c3207008ca4
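
With the new options in place, a tuning run followed by the tuned, measured run can be requested from a single ExecuteNetwork invocation, for example (paths are placeholders and the remaining input/output options are omitted): ExecuteNetwork --model-format tflite-binary --model-path model.tflite --compute GpuAcc --tuning-path tuned_params.csv --tuning-level 2 --iterations 10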

include/armnn/utility/Timer.hpp [new file with mode: 0644]
src/armnn/Runtime.cpp
src/backends/cl/ClBackendContext.cpp
tests/ExecuteNetwork/ExecuteNetwork.cpp
tests/InferenceModel.hpp
tests/NetworkExecutionUtils/NetworkExecutionUtils.hpp

diff --git a/include/armnn/utility/Timer.hpp b/include/armnn/utility/Timer.hpp
new file mode 100644
index 0000000..daf689e
--- /dev/null
@@ -0,0 +1,25 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <chrono>
+#include <iomanip>
+
+namespace armnn
+{
+
+inline std::chrono::high_resolution_clock::time_point GetTimeNow()
+{
+    return std::chrono::high_resolution_clock::now();
+}
+
+inline std::chrono::duration<double, std::milli> GetTimeDuration(
+        std::chrono::high_resolution_clock::time_point start_time)
+{
+    return std::chrono::duration<double, std::milli>(GetTimeNow() - start_time);
+}
+
+}
\ No newline at end of file
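
The two helpers above centralise the timing pattern used throughout this change (and replace the private GetCurrentTime/GetTimeDuration members removed from InferenceModel.hpp below). A minimal usage sketch, assuming only that the header is on the include path (the function name is illustrative):

    #include <armnn/utility/Timer.hpp>
    #include <iostream>

    void TimedWork()
    {
        const auto start_time = armnn::GetTimeNow();   // high_resolution_clock time point

        // ... work to be measured ...

        // GetTimeDuration returns std::chrono::duration<double, std::milli>
        std::cout << "Elapsed: " << armnn::GetTimeDuration(start_time).count() << " ms\n";
    }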
diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp
index dbdd409..b1b7d51 100644
@@ -7,6 +7,7 @@
 #include <armnn/Version.hpp>
 #include <armnn/BackendRegistry.hpp>
 #include <armnn/Logging.hpp>
+#include <armnn/utility/Timer.hpp>
 
 #include <armnn/backends/IBackendContext.hpp>
 #include <backendsCommon/DynamicBackendUtils.hpp>
@@ -171,6 +172,7 @@ Runtime::Runtime(const CreationOptions& options)
     : m_NetworkIdCounter(0),
       m_ProfilingService(*this)
 {
+    const auto start_time = armnn::GetTimeNow();
     ARMNN_LOG(info) << "ArmNN v" << ARMNN_VERSION << "\n";
 
     if ( options.m_ProfilingOptions.m_TimelineEnabled && !options.m_ProfilingOptions.m_EnableProfiling )
@@ -225,10 +227,14 @@ Runtime::Runtime(const CreationOptions& options)
     m_ProfilingService.ConfigureProfilingService(options.m_ProfilingOptions);
 
     m_DeviceSpec.AddSupportedBackends(supportedBackends);
+
+    ARMNN_LOG(info) << "Initialization time: " << std::setprecision(2)
+                    << std::fixed << armnn::GetTimeDuration(start_time).count() << " ms\n";
 }
 
 Runtime::~Runtime()
 {
+    const auto start_time = armnn::GetTimeNow();
     std::vector<int> networkIDs;
     try
     {
@@ -272,6 +278,8 @@ Runtime::~Runtime()
     m_BackendContexts.clear();
 
     BackendRegistryInstance().SetProfilingService(armnn::EmptyOptional());
+    ARMNN_LOG(info) << "Shutdown time: " << std::setprecision(2)
+                    << std::fixed << armnn::GetTimeDuration(start_time).count() << " ms\n";
 }
 
 LoadedNetwork* Runtime::GetLoadedNetworkPtr(NetworkId networkId) const
diff --git a/src/backends/cl/ClBackendContext.cpp b/src/backends/cl/ClBackendContext.cpp
index bfe93bd..42f42b3 100644
@@ -79,7 +79,7 @@ TuningLevel ParseTuningLevel(const BackendOptions::Var& value, TuningLevel defau
 {
     if (value.IsInt())
     {
-        int v = value.IsInt();
+        int v = value.AsInt();
         if (v > static_cast<int>(TuningLevel::Exhaustive) ||
             v < static_cast<int>(TuningLevel::None))
         {
@@ -218,18 +218,18 @@ ClBackendContext::ClBackendContext(const IRuntime::CreationOptions& options)
 
         ConfigureTuner(*(m_Tuner.get()), tuningLevel);
 
-        if (!m_TuningFile.empty())
+        if (!m_TuningFile.empty() && tuningLevel == TuningLevel::None)
         {
             try
             {
                 m_Tuner->load_from_file(m_TuningFile.c_str());
-            } catch (const std::exception& e)
+            }
+            catch (const std::exception& e)
             {
                 ARMNN_LOG(warning) << "Could not load GpuAcc tuner data file.";
             }
-
-            tuner = m_Tuner.get();
         }
+        tuner = m_Tuner.get();
     }
 
     m_ClContextControlWrapper = std::make_unique<ClContextControlWrapper>(
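
For reference, the backend options that drive this path are built the same way ExecuteNetwork does further down; a minimal sketch with placeholder values (a non-zero TuningLevel triggers a tuning run that creates/updates the file, while TuningLevel 0 makes the context load it instead):

    #include <armnn/ArmNN.hpp>

    armnn::IRuntime::CreationOptions options;
    options.m_BackendOptions.emplace_back(
        armnn::BackendOptions
        {
            "GpuAcc",
            {
                {"TuningLevel", 2},                  // 0 = None, 1 = Rapid, 2 = Normal, 3 = Exhaustive
                {"TuningFile", "tuned_params.csv"},  // placeholder path
                {"KernelProfilingEnabled", false}
            }
        });
    std::shared_ptr<armnn::IRuntime> runtime(armnn::IRuntime::Create(options));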
diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp
index 57b8692..66d8e13 100644
@@ -35,6 +35,10 @@ int main(int argc, const char* argv[])
     uint32_t counterCapturePeriod;
     std::string fileFormat;
 
+    size_t iterations = 1;
+    int tuningLevel = 0;
+    std::string tuningPath;
+
     double thresholdTime = 0.0;
 
     size_t subgraphId = 0;
@@ -121,6 +125,14 @@ int main(int argc, const char* argv[])
              "If profiling is enabled in 'file-only' mode this is the capture period that will be used in the test")
             ("file-format", po::value(&fileFormat)->default_value("binary"),
              "If profiling is enabled specifies the output file format")
+            ("iterations", po::value<size_t>(&iterations)->default_value(1),
+             "Number of iterations to run the network for, default is set to 1")
+            ("tuning-path", po::value(&tuningPath),
+            "Path to tuning file. Enables use of CL tuning")
+            ("tuning-level", po::value<int>(&tuningLevel)->default_value(0),
+             "Sets the tuning level which enables a tuning run which will update/create a tuning file. "
+             "Available options are: 1 (Rapid), 2 (Normal), 3 (Exhaustive). "
+             "Requires tuning-path to be set, default is set to 0 (No tuning run)")
             ("parse-unsupported", po::bool_switch()->default_value(false),
                 "Add unsupported operators as stand-in layers (where supported by parser)");
     }
@@ -275,6 +287,33 @@ int main(int argc, const char* argv[])
         // Remove duplicates from the list of compute devices.
         RemoveDuplicateDevices(computeDevices);
 
+#if defined(ARMCOMPUTECL_ENABLED)
+        std::shared_ptr<armnn::IGpuAccTunedParameters> tuned_params;
+
+        if (tuningPath != "")
+        {
+            if (tuningLevel != 0)
+            {
+                RunCLTuning(tuningPath, tuningLevel, modelFormat, inputTensorShapes, computeDevices,
+                    dynamicBackendsPath, modelPath, inputNames, inputTensorDataFilePaths, inputTypes, quantizeInput,
+                    outputTypes, outputNames, outputTensorFiles, dequantizeOutput, enableProfiling,
+                    enableFp16TurboMode, enableBf16TurboMode, thresholdTime, printIntermediate, subgraphId,
+                    enableLayerDetails, parseUnsupported);
+            }
+            ARMNN_LOG(info) << "Using tuning params: " << tuningPath << "\n";
+            options.m_BackendOptions.emplace_back(
+                armnn::BackendOptions
+                {
+                    "GpuAcc",
+                    {
+                        {"TuningLevel", 0},
+                        {"TuningFile", tuningPath.c_str()},
+                        {"KernelProfilingEnabled", enableProfiling}
+                    }
+                }
+            );
+        }
+#endif
         try
         {
             CheckOptionDependencies(vm);
@@ -288,9 +327,9 @@ int main(int argc, const char* argv[])
         // Create runtime
         std::shared_ptr<armnn::IRuntime> runtime(armnn::IRuntime::Create(options));
 
-        return RunTest(modelFormat, inputTensorShapes, computeDevices, dynamicBackendsPath, modelPath, inputNames,
-                       inputTensorDataFilePaths, inputTypes, quantizeInput, outputTypes, outputNames,
-                       outputTensorFiles, dequantizeOutput, enableProfiling, enableFp16TurboMode, enableBf16TurboMode,
-                       thresholdTime, printIntermediate, subgraphId, enableLayerDetails, parseUnsupported, runtime);
+        return RunTest(modelFormat, inputTensorShapes, computeDevices, dynamicBackendsPath, modelPath,
+            inputNames, inputTensorDataFilePaths, inputTypes, quantizeInput, outputTypes, outputNames,
+            outputTensorFiles, dequantizeOutput, enableProfiling, enableFp16TurboMode, enableBf16TurboMode,
+            thresholdTime, printIntermediate, subgraphId, enableLayerDetails, parseUnsupported, iterations, runtime);
     }
 }
diff --git a/tests/InferenceModel.hpp b/tests/InferenceModel.hpp
index 410bc7c..781cef4 100644
@@ -6,6 +6,8 @@
 #pragma once
 
 #include <armnn/ArmNN.hpp>
+#include <armnn/Logging.hpp>
+#include <armnn/utility/Timer.hpp>
 #include <armnn/BackendRegistry.hpp>
 #include <armnn/utility/Assert.hpp>
 
@@ -31,7 +33,6 @@
 #include <boost/variant.hpp>
 
 #include <algorithm>
-#include <chrono>
 #include <iterator>
 #include <fstream>
 #include <map>
@@ -399,8 +400,12 @@ public:
             throw armnn::Exception("Some backend IDs are invalid: " + invalidBackends);
         }
 
+        const auto parsing_start_time = armnn::GetTimeNow();
         armnn::INetworkPtr network = CreateNetworkImpl<IParser>::Create(params, m_InputBindings, m_OutputBindings);
 
+        ARMNN_LOG(info) << "Network parsing time: " << std::setprecision(2)
+                        << std::fixed << armnn::GetTimeDuration(parsing_start_time).count() << " ms\n";
+
         armnn::IOptimizedNetworkPtr optNet{nullptr, [](armnn::IOptimizedNetwork*){}};
         {
             ARMNN_SCOPED_HEAP_PROFILING("Optimizing");
@@ -410,7 +415,12 @@ public:
             options.m_ReduceFp32ToBf16 = params.m_EnableBf16TurboMode;
             options.m_Debug = params.m_PrintIntermediateLayers;
 
+            const auto optimization_start_time = armnn::GetTimeNow();
             optNet = armnn::Optimize(*network, params.m_ComputeDevices, m_Runtime->GetDeviceSpec(), options);
+
+            ARMNN_LOG(info) << "Optimization time: " << std::setprecision(2)
+                            << std::fixed << armnn::GetTimeDuration(optimization_start_time).count() << " ms\n";
+
             if (!optNet)
             {
                 throw armnn::Exception("Optimize returned nullptr");
@@ -494,13 +504,13 @@ public:
         }
 
         // Start timer to record inference time in EnqueueWorkload (in milliseconds)
-        const auto start_time = GetCurrentTime();
+        const auto start_time = armnn::GetTimeNow();
 
         armnn::Status ret = m_Runtime->EnqueueWorkload(m_NetworkIdentifier,
                                                        MakeInputTensors(inputContainers),
                                                        MakeOutputTensors(outputContainers));
 
-        const auto end_time = GetCurrentTime();
+        const auto duration = armnn::GetTimeDuration(start_time);
 
         // if profiling is enabled print out the results
         if (profiler && profiler->IsProfilingEnabled())
@@ -514,7 +524,7 @@ public:
         }
         else
         {
-            return std::chrono::duration<double, std::milli>(end_time - start_time);
+            return duration;
         }
     }
 
@@ -584,17 +594,4 @@ private:
     {
         return armnnUtils::MakeOutputTensors(m_OutputBindings, outputDataContainers);
     }
-
-    std::chrono::high_resolution_clock::time_point GetCurrentTime()
-    {
-        return std::chrono::high_resolution_clock::now();
-    }
-
-    std::chrono::duration<double, std::milli> GetTimeDuration(
-            std::chrono::high_resolution_clock::time_point& start_time,
-            std::chrono::high_resolution_clock::time_point& end_time)
-    {
-        return std::chrono::duration<double, std::milli>(end_time - start_time);
-    }
-
 };
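
With the private timing helpers removed, callers now measure an inference through the duration that Run returns; a minimal sketch of the calling pattern used in MainImpl below, assuming the input/output containers are already populated:

    // Run returns the EnqueueWorkload time as std::chrono::duration<double, std::milli>
    auto inference_duration = model.Run(inputDataContainers, outputDataContainers);

    ARMNN_LOG(info) << "Inference time: " << std::setprecision(2)
                    << std::fixed << inference_duration.count() << " ms";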
diff --git a/tests/NetworkExecutionUtils/NetworkExecutionUtils.hpp b/tests/NetworkExecutionUtils/NetworkExecutionUtils.hpp
index a922228..ec0eaf9 100644
@@ -4,6 +4,7 @@
 //
 #include <armnn/ArmNN.hpp>
 #include <armnn/TypesUtils.hpp>
+#include <armnn/utility/Timer.hpp>
 
 #if defined(ARMNN_SERIALIZER)
 #include "armnnDeserializer/IDeserializer.hpp"
@@ -378,7 +379,8 @@ struct ExecuteNetworkParams
 
 template<typename TParser, typename TDataType>
 int MainImpl(const ExecuteNetworkParams& params,
-             const std::shared_ptr<armnn::IRuntime>& runtime = nullptr)
+             const std::shared_ptr<armnn::IRuntime>& runtime = nullptr,
+             size_t iterations = 1)
 {
     using TContainer = boost::variant<std::vector<float>, std::vector<int>, std::vector<unsigned char>>;
 
@@ -473,44 +475,47 @@ int MainImpl(const ExecuteNetworkParams& params,
             }
         }
 
-        // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
-        auto inference_duration = model.Run(inputDataContainers, outputDataContainers);
-
-        if (params.m_GenerateTensorData)
+        for (size_t x = 0; x < iterations; x++)
         {
-            ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful";
-        }
+            // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
+            auto inference_duration = model.Run(inputDataContainers, outputDataContainers);
 
-        // Print output tensors
-        const auto& infosOut = model.GetOutputBindingInfos();
-        for (size_t i = 0; i < numOutputs; i++)
-        {
-            const armnn::TensorInfo& infoOut = infosOut[i].second;
-            auto outputTensorFile = params.m_OutputTensorFiles.empty() ? "" : params.m_OutputTensorFiles[i];
-
-            TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
-                                  infoOut,
-                                  outputTensorFile,
-                                  params.m_DequantizeOutput);
-            boost::apply_visitor(printer, outputDataContainers[i]);
-        }
+            if (params.m_GenerateTensorData)
+            {
+                ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful";
+            }
 
-        ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
-                                << std::fixed << inference_duration.count() << " ms";
+            // Print output tensors
+            const auto& infosOut = model.GetOutputBindingInfos();
+            for (size_t i = 0; i < numOutputs; i++)
+            {
+                const armnn::TensorInfo& infoOut = infosOut[i].second;
+                auto outputTensorFile = params.m_OutputTensorFiles.empty() ? "" : params.m_OutputTensorFiles[i];
+
+                TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
+                                    infoOut,
+                                    outputTensorFile,
+                                    params.m_DequantizeOutput);
+                boost::apply_visitor(printer, outputDataContainers[i]);
+            }
 
-        // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
-        if (params.m_ThresholdTime != 0.0)
-        {
-            ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
-                                    << std::fixed << params.m_ThresholdTime << " ms";
-            auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
-            ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
-                                    << std::fixed << thresholdMinusInference << " ms" << "\n";
+            ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
+                                    << std::fixed << inference_duration.count() << " ms\n";
 
-            if (thresholdMinusInference < 0)
+            // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
+            if (params.m_ThresholdTime != 0.0)
             {
-                std::string errorMessage = "Elapsed inference time is greater than provided threshold time.";
-                ARMNN_LOG(fatal) << errorMessage;
+                ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
+                                        << std::fixed << params.m_ThresholdTime << " ms";
+                auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
+                ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
+                                        << std::fixed << thresholdMinusInference << " ms" << "\n";
+
+                if (thresholdMinusInference < 0)
+                {
+                    std::string errorMessage = "Elapsed inference time is greater than provided threshold time.";
+                    ARMNN_LOG(fatal) << errorMessage;
+                }
             }
         }
     }
@@ -545,6 +550,7 @@ int RunTest(const std::string& format,
             const size_t subgraphId,
             bool enableLayerDetails = false,
             bool parseUnsupported = false,
+            const size_t iterations = 1,
             const std::shared_ptr<armnn::IRuntime>& runtime = nullptr)
 {
     std::string modelFormat = armnn::stringUtils::StringTrimCopy(format);
@@ -682,34 +688,34 @@ int RunTest(const std::string& format,
     if (modelFormat.find("armnn") != std::string::npos)
     {
 #if defined(ARMNN_SERIALIZER)
-    return MainImpl<armnnDeserializer::IDeserializer, float>(params, runtime);
+        return MainImpl<armnnDeserializer::IDeserializer, float>(params, runtime, iterations);
 #else
         ARMNN_LOG(fatal) << "Not built with serialization support.";
-    return EXIT_FAILURE;
+        return EXIT_FAILURE;
 #endif
     }
     else if (modelFormat.find("caffe") != std::string::npos)
     {
 #if defined(ARMNN_CAFFE_PARSER)
-        return MainImpl<armnnCaffeParser::ICaffeParser, float>(params, runtime);
+        return MainImpl<armnnCaffeParser::ICaffeParser, float>(params, runtime, iterations);
 #else
         ARMNN_LOG(fatal) << "Not built with Caffe parser support.";
         return EXIT_FAILURE;
 #endif
     }
     else if (modelFormat.find("onnx") != std::string::npos)
-{
+    {
 #if defined(ARMNN_ONNX_PARSER)
-    return MainImpl<armnnOnnxParser::IOnnxParser, float>(params, runtime);
+        return MainImpl<armnnOnnxParser::IOnnxParser, float>(params, runtime, iterations);
 #else
         ARMNN_LOG(fatal) << "Not built with Onnx parser support.";
-    return EXIT_FAILURE;
+        return EXIT_FAILURE;
 #endif
     }
     else if (modelFormat.find("tensorflow") != std::string::npos)
     {
 #if defined(ARMNN_TF_PARSER)
-        return MainImpl<armnnTfParser::ITfParser, float>(params, runtime);
+        return MainImpl<armnnTfParser::ITfParser, float>(params, runtime, iterations);
 #else
         ARMNN_LOG(fatal) << "Not built with Tensorflow parser support.";
         return EXIT_FAILURE;
@@ -720,21 +726,21 @@ int RunTest(const std::string& format,
 #if defined(ARMNN_TF_LITE_PARSER)
         if (! isModelBinary)
         {
-            ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat << "'. Only 'binary' format supported \
-              for tflite files";
+            ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat
+                << "'. Only 'binary' format supported for tflite files";
             return EXIT_FAILURE;
         }
-        return MainImpl<armnnTfLiteParser::ITfLiteParser, float>(params, runtime);
+        return MainImpl<armnnTfLiteParser::ITfLiteParser, float>(params, runtime, iterations);
 #else
-        ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat <<
-            "'. Please include 'caffe', 'tensorflow', 'tflite' or 'onnx'";
+        ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat
+            << "'. Please include 'caffe', 'tensorflow', 'tflite' or 'onnx'";
         return EXIT_FAILURE;
 #endif
     }
     else
     {
-        ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat <<
-                                 "'. Please include 'caffe', 'tensorflow', 'tflite' or 'onnx'";
+        ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat
+            << "'. Please include 'caffe', 'tensorflow', 'tflite' or 'onnx'";
         return EXIT_FAILURE;
     }
 }
@@ -864,3 +870,57 @@ int RunCsvTest(const armnnUtils::CsvRow &csvRow, const std::shared_ptr<armnn::IR
                    dequantizeOutput, enableProfiling, enableFp16TurboMode, enableBf16TurboMode,
                    thresholdTime, printIntermediate, subgraphId, enableLayerDetails, parseUnuspported);
 }
+
+#if defined(ARMCOMPUTECL_ENABLED)
+int RunCLTuning(const std::string& tuningPath,
+            const int tuningLevel,
+            const std::string& modelFormat,
+            const std::string& inputTensorShapes,
+            const vector<armnn::BackendId>& computeDevices,
+            const std::string& dynamicBackendsPath,
+            const std::string& modelPath,
+            const std::string& inputNames,
+            const std::string& inputTensorDataFilePaths,
+            const std::string& inputTypes,
+            bool quantizeInput,
+            const std::string& outputTypes,
+            const std::string& outputNames,
+            const std::string& outputTensorFiles,
+            bool dequantizeOutput,
+            bool enableProfiling,
+            bool enableFp16TurboMode,
+            bool enableBf16TurboMode,
+            const double& thresholdTime,
+            bool printIntermediate,
+            const size_t subgraphId,
+            bool enableLayerDetails = false,
+            bool parseUnsupported = false)
+{
+    armnn::IRuntime::CreationOptions options;
+    options.m_BackendOptions.emplace_back(
+        armnn::BackendOptions
+        {
+            "GpuAcc",
+            {
+                {"TuningLevel", tuningLevel},
+                {"TuningFile", tuningPath.c_str()},
+                {"KernelProfilingEnabled", enableProfiling}
+            }
+        }
+    );
+
+    std::shared_ptr<armnn::IRuntime> runtime(armnn::IRuntime::Create(options));
+    const auto start_time = armnn::GetTimeNow();
+
+    ARMNN_LOG(info) << "Tuning run...\n";
+    int state = RunTest(modelFormat, inputTensorShapes, computeDevices, dynamicBackendsPath, modelPath, inputNames,
+                        inputTensorDataFilePaths, inputTypes, quantizeInput, outputTypes, outputNames,
+                        outputTensorFiles, dequantizeOutput, enableProfiling, enableFp16TurboMode, enableBf16TurboMode,
+                        thresholdTime, printIntermediate, subgraphId, enableLayerDetails, parseUnsupported, 1, runtime);
+
+    ARMNN_LOG(info) << "Tuning time: " << std::setprecision(2)
+                    << std::fixed << armnn::GetTimeDuration(start_time).count() << " ms\n";
+
+    return state;
+}
+#endif
\ No newline at end of file