From ab8022f74e65f6d4a33c8e64993adc87b0bfaa31 Mon Sep 17 00:00:00 2001
From: "Li, Peng"
Date: Mon, 25 Jun 2018 22:06:18 +0800
Subject: [PATCH] update convolution opencl kernels in dnn module (#11762)

* optimize ocl kernel enqueue in fc layer

Signed-off-by: Li Peng

* use CV_LOG_INFO in convolution auto tuning

Signed-off-by: Li Peng

* update convolution IDLF kernel

extend parameter tuning range, also cleanup ocl kernel implementation

Signed-off-by: Li Peng

* update in-memory convolution cache config

fp16 and fp32 cache config are stored separately

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/fully_connected_layer.cpp   |    4 +-
 .../src/ocl4dnn/include/default_kernel_config.hpp  | 1166 ++++++++++++++------
 .../dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp   |  192 ++--
 modules/dnn/src/opencl/conv_layer_spatial.cl       |  209 ++--
 4 files changed, 985 insertions(+), 586 deletions(-)

diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index 5152d60..499f672 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -310,7 +310,6 @@ public:
                 innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
             }
-            UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
             for (size_t i = 0; i < inputs.size(); i++)
             {
                 MatShape inshape, outshape;
@@ -320,7 +319,6 @@ public:
                 UMat srcMat, dstMat;
                 srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
                 dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
-                dstMat.setTo(0.0f);
                 if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
                                              (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
@@ -332,6 +330,7 @@ public:
                 if (!use_half && bias && (outerSize > 1))
                 {
+                    UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                     UMat& biases = umat_blobs[1];
                     cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
                 }
@@ -354,6 +353,7 @@ public:
             if (bias)
             {
+                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                 UMat& biases = umat_blobs[1];
                 cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
             }
diff --git a/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp
index 09e0c27..a25b2bf 100644
--- a/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp
+++ b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp
@@ -1,23 +1,24 @@
 #ifndef _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_
 #define _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_
-const char *default_kernel_config_intel[] = {
+const char *default_kernel_config_intel_fp32[] = {
 // Below is the information for OpenCL based on which these configurations tuned
 /*******************************************************************************
 Number of platforms                              1
-  Platform Name                                  Intel(R) OpenCL
+  Platform Name                                  Intel(R) OpenCL HD Graphics
   Platform Vendor                                Intel(R) Corporation
-  Platform Version                               OpenCL 2.0
+  Platform Version                               OpenCL 2.1
   Platform Profile                               FULL_PROFILE
-  Platform Extensions                            cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd
cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation + Platform Host timer resolution 1ns Platform Extensions function suffix INTEL - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Number of devices 1 - Device Name Intel(R) HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO Device Vendor Intel(R) Corporation Device Vendor ID 0x8086 - Device Version OpenCL 2.0 - Driver Version r4.1.61547 + Device Version OpenCL 2.1 NEO + Driver Version 2018ww15-010713 Device OpenCL C Version OpenCL C 2.0 Device Type GPU Device Profile FULL_PROFILE @@ -25,11 +26,12 @@ Number of devices 1 Max clock frequency 950MHz Device Partition (core) Max number of sub-devices 0 - Supported partition types by (0x7F2F00000000) + Supported partition types None Max work item dimensions 3 Max work item sizes 256x256x256 Max work group size 256 Preferred work group size multiple 32 + Max sub-groups per work group 32 Preferred / native vector sizes char 16 / 16 short 8 / 8 @@ -66,15 +68,15 @@ Number of devices 1 Support is emulated in software No Correctly-rounded divide and sqrt operations No Address bits 64, Little-Endian - Global memory size 26888119911 (25.04GiB) + Global memory size 26892222464 (25.05GiB) Error Correction support No - Max memory allocation 4294959103 (4GiB) + Max memory allocation 4294959104 (4GiB) Unified memory for Host and Device Yes Shared Virtual Memory (SVM) capabilities (core) Coarse-grained buffer sharing Yes - Fine-grained buffer sharing Yes + Fine-grained buffer sharing No Fine-grained system sharing No - Atomics Yes + Atomics No Minimum alignment for any data type 128 bytes Alignment of base address 1024 bits (128 bytes) Preferred alignment for atomics @@ -82,13 +84,13 @@ Number of devices 1 Global 64 bytes Local 64 bytes Max size for global variable 65536 (64KiB) - Preferred total size of global vars 4294959103 (4GiB) + Preferred total size of global vars 4294959104 (4GiB) Global Memory cache type Read/Write Global Memory cache size 1572864 Global Memory cache line 64 bytes Image support Yes Max number of samplers per kernel 16 - Max size for 1D images from buffer 268434943 pixels + Max size for 1D images from buffer 268434944 pixels Max 1D or 2D image array size 2048 images Base address alignment for 2D image buffers 4 bytes Pitch alignment for 2D image buffers 4 bytes @@ -102,7 +104,7 @@ Number of devices 1 Max pipe packet size 1024 Local memory type Local Local memory size 65536 (64KiB) - Max constant buffer size 4294959103 (4GiB) + Max constant buffer size 4294959104 (4GiB) Max number of constant args 8 Max size of kernel argument 1024 Queue properties (on host) @@ -120,114 +122,171 @@ Number of 
devices 1 Execution capabilities Run OpenCL kernels Yes Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 SPIR versions 1.2 printf() buffer size 4194304 (4MiB) - Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; Motion Estimation accelerator version (Intel) 2 Device Available Yes Compiler Available Yes Linker Available Yes - Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation NULL platform behavior - clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform - clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform - clCreateContext(NULL, ...) [default] No platform - clCreateContext(NULL, ...) [other] Success [INTEL] - clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform - clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform - clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform - clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform - clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) Intel(R) OpenCL HD Graphics + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [INTEL] + clCreateContext(NULL, ...) 
[default] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + +ICD loader properties + ICD loader Name OpenCL ICD Loader + ICD loader Vendor OCL Icd free software + ICD loader Version 2.2.8 + ICD loader Profile OpenCL 1.2 + NOTE: your OpenCL library declares to support OpenCL 1.2, + but it seems to support up to OpenCL 2.1 too. ********************************************************************************/ -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","5 5 16 2 1 1 16 1 0 ", -"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","6 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 10 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","2 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0","2 5 16 2 1 1 16 1 0 ", -"EU72_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","3 7 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0","6 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 5 16 2 1 1 16 1 0 ", -"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","5 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0","2 10 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0","2 7 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", 
-"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","2 7 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","2 6 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0","1 16 32 5 1 16 1 1 0 ", -"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","5 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0","1 16 32 5 1 16 1 1 0 ", -"EU72_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0","2 5 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU72_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0","9 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","14 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","3 6 16 2 1 1 16 1 0 ", -"EU72_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","3 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","7 4 8 2 1 1 8 1 0 ", -"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", 
-"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0","3 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","10 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0","2 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1","2 8 32 5 1 8 1 1 0 ", + +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "6 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "2 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ5_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ5_eltwise0_FP32", "12 2 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ5_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "1 3 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "2 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "2 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "3 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ5_eltwise0_FP32", "2 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", 
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ5_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ5_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn32_g1_s1x1_d1x1_b1_in160x160_p0x0_num1_M64_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn32_g1_s1x1_d1x1_b1_in160x160_p0x0_num1_M64_activ5_eltwise0_FP32", "1 16 32 5 1 16 1 1 0", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP32", "7 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "8 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "8 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "3 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M12_activ0_eltwise0_FP32", "6 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M273_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ5_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M63_activ0_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "8 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 
5 1 8 1 1 0", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP32", "7 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k3x3_cn1024_g1024_s1x1_d1x1_b1_in16x16_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn1024_g1024_s1x1_d1x1_b1_in16x16_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn128_g128_s1x1_d1x1_b1_in80x80_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn128_g128_s1x1_d1x1_b1_in80x80_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn128_g128_s2x2_d1x1_b1_in80x80_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn128_g128_s2x2_d1x1_b1_in80x80_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn128_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M256_activ5_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU72_k3x3_cn128_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn256_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ5_eltwise0_FP32", "3 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn256_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn256_g256_s1x1_d1x1_b1_in48x48_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn256_g256_s1x1_d1x1_b1_in48x48_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn256_g256_s2x2_d1x1_b1_in48x48_p0x0_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn256_g256_s2x2_d1x1_b1_in48x48_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn32_g32_s1x1_d1x1_b1_in160x160_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn32_g32_s1x1_d1x1_b1_in160x160_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p0x0_num1_M32_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M32_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k3x3_cn512_g512_s1x1_d1x1_b1_in32x32_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn512_g512_s1x1_d1x1_b1_in32x32_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn512_g512_s2x2_d1x1_b1_in32x32_p1x1_num1_M1_activ1_eltwise0_FP32", 
"1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn512_g512_s2x2_d1x1_b1_in32x32_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn64_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU72_k3x3_cn64_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn64_g64_s2x2_d1x1_b1_in160x160_p0x0_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn64_g64_s2x2_d1x1_b1_in160x160_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP32", "4 2 8 2 1 1 8 1 0", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP32", "6 1 16 2 1 1 16 1 0", +"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "6 1 16 2 1 1 16 1 0", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP32", "4 3 16 2 1 1 16 1 0", // Below is the information for OpenCL based on which these configurations tuned /******************************************************************************* Number of platforms 1 - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Platform Vendor Intel(R) Corporation - Platform Version OpenCL 2.0 + Platform Version OpenCL 2.1 Platform Profile FULL_PROFILE - Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation + Platform Host timer resolution 1ns Platform Extensions function suffix 
INTEL - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Number of devices 1 - Device Name Intel(R) HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO Device Vendor Intel(R) Corporation Device Vendor ID 0x8086 - Device Version OpenCL 2.0 - Driver Version r4.1.61547 + Device Version OpenCL 2.1 NEO + Driver Version 18.21.10858 Device OpenCL C Version OpenCL C 2.0 Device Type GPU Device Profile FULL_PROFILE @@ -235,11 +294,12 @@ Number of devices 1 Max clock frequency 950MHz Device Partition (core) Max number of sub-devices 0 - Supported partition types by (0x7F2200000000) + Supported partition types None Max work item dimensions 3 Max work item sizes 256x256x256 Max work group size 256 Preferred work group size multiple 32 + Max sub-groups per work group 32 Preferred / native vector sizes char 16 / 16 short 8 / 8 @@ -276,9 +336,9 @@ Number of devices 1 Support is emulated in software No Correctly-rounded divide and sqrt operations No Address bits 64, Little-Endian - Global memory size 13361912218 (12.44GiB) + Global memory size 13364170752 (12.45GiB) Error Correction support No - Max memory allocation 4294959103 (4GiB) + Max memory allocation 4294959104 (4GiB) Unified memory for Host and Device Yes Shared Virtual Memory (SVM) capabilities (core) Coarse-grained buffer sharing Yes @@ -292,13 +352,13 @@ Number of devices 1 Global 64 bytes Local 64 bytes Max size for global variable 65536 (64KiB) - Preferred total size of global vars 4294959103 (4GiB) + Preferred total size of global vars 4294959104 (4GiB) Global Memory cache type Read/Write Global Memory cache size 1048576 Global Memory cache line 64 bytes Image support Yes Max number of samplers per kernel 16 - Max size for 1D images from buffer 268434943 pixels + Max size for 1D images from buffer 268434944 pixels Max 1D or 2D image array size 2048 images Base address alignment for 2D image buffers 4 bytes Pitch alignment for 2D image buffers 4 bytes @@ -312,7 +372,7 @@ Number of devices 1 Max pipe packet size 1024 Local memory type Local Local memory size 65536 (64KiB) - Max constant buffer size 4294959103 (4GiB) + Max constant buffer size 4294959104 (4GiB) Max number of constant args 8 Max size of kernel argument 1024 Queue properties (on host) @@ -330,14 +390,16 @@ Number of devices 1 Execution capabilities Run OpenCL kernels Yes Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 SPIR versions 1.2 printf() buffer size 4194304 (4MiB) - Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; Motion Estimation accelerator version (Intel) 2 Device Available Yes Compiler Available Yes Linker Available Yes - Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image 
cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation NULL platform behavior clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform @@ -350,106 +412,155 @@ NULL platform behavior clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform ********************************************************************************/ -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","13 1 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1","1 8 32 5 1 8 1 1 0 ", -"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0","2 5 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0","1 16 32 5 1 16 1 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","7 4 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","14 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0","1 16 32 5 1 16 1 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 10 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0","2 8 
32 5 1 8 1 1 0 ", -"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0","4 4 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","3 7 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","4 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0","6 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","9 3 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","7 4 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","3 2 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1","2 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 10 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","2 3 16 2 1 1 16 1 0 ", 
-"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","3 5 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0","3 7 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","7 4 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ5_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "3 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "2 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ5_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", 
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn32_g1_s1x1_d1x1_b1_in160x160_p0x0_num1_M64_activ1_eltwise0_FP32", "1 16 32 5 1 16 1 1 0", +"EU48_k1x1_cn32_g1_s1x1_d1x1_b1_in160x160_p0x0_num1_M64_activ5_eltwise0_FP32", "1 16 32 5 1 16 1 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "4 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M12_activ0_eltwise0_FP32", "9 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M273_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M63_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", 
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "7 2 8 2 1 1 8 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "4 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP32", "4 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn1024_g1024_s1x1_d1x1_b1_in16x16_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn1024_g1024_s1x1_d1x1_b1_in16x16_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g128_s1x1_d1x1_b1_in80x80_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn128_g128_s1x1_d1x1_b1_in80x80_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn128_g128_s2x2_d1x1_b1_in80x80_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn128_g128_s2x2_d1x1_b1_in80x80_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M256_activ5_eltwise0_FP32", "2 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn256_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ5_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn256_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn256_g256_s1x1_d1x1_b1_in48x48_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn256_g256_s1x1_d1x1_b1_in48x48_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn256_g256_s2x2_d1x1_b1_in48x48_p0x0_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn256_g256_s2x2_d1x1_b1_in48x48_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn32_g32_s1x1_d1x1_b1_in160x160_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn32_g32_s1x1_d1x1_b1_in160x160_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p0x0_num1_M32_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M32_activ1_eltwise0_FP32", "1 8 32 
5 1 8 1 1 0", +"EU48_k3x3_cn512_g512_s1x1_d1x1_b1_in32x32_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn512_g512_s1x1_d1x1_b1_in32x32_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn512_g512_s2x2_d1x1_b1_in32x32_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn512_g512_s2x2_d1x1_b1_in32x32_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU48_k3x3_cn64_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn64_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn64_g64_s2x2_d1x1_b1_in160x160_p0x0_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn64_g64_s2x2_d1x1_b1_in160x160_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP32", "4 2 8 2 1 1 8 1 0", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP32", "4 3 8 2 1 1 8 1 0", +"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "4 2 8 2 1 1 8 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP32", "4 7 8 2 1 1 8 1 0", +"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP32", "5 4 16 2 1 1 16 1 0", +"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "13 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0_FP32", "3 9 8 2 1 1 8 1 0", // Below is the information for OpenCL based on which these configurations tuned /******************************************************************************* Number of platforms 1 - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Platform Vendor Intel(R) Corporation - Platform Version OpenCL 2.0 + Platform Version OpenCL 2.1 Platform Profile FULL_PROFILE - Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions 
cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_fp64 cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation cl_intel_va_api_media_sharing + Platform Host timer resolution 1ns Platform Extensions function suffix INTEL - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Number of devices 1 - Device Name Intel(R) HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO Device Vendor Intel(R) Corporation Device Vendor ID 0x8086 - Device Version OpenCL 2.0 - Driver Version 16.5.59288 + Device Version OpenCL 2.1 NEO + Driver Version 18.23.10915 Device OpenCL C Version OpenCL C 2.0 Device Type GPU Device Profile FULL_PROFILE Max compute units 24 - Max clock frequency 1050MHz + Max clock frequency 1150MHz Device Partition (core) Max number of sub-devices 0 - Supported partition types by (0x7FC300000000) + Supported partition types None Max work item dimensions 3 Max work item sizes 256x256x256 Max work group size 256 Preferred work group size multiple 32 + Max sub-groups per work group 32 Preferred / native vector sizes char 16 / 16 short 8 / 8 @@ -486,9 +597,9 @@ Number of devices 1 Support is emulated in software No Correctly-rounded divide and sqrt operations No Address bits 64, Little-Endian - Global memory size 6588809216 (6.136GiB) + Global memory size 6575288320 (6.124GiB) Error Correction support No - Max memory allocation 3294404608 (3.068GiB) + Max memory allocation 3287644160 (3.062GiB) Unified memory for Host and Device Yes Shared Virtual Memory (SVM) capabilities (core) Coarse-grained buffer sharing Yes @@ -502,13 +613,241 @@ Number of devices 1 Global 64 bytes Local 64 bytes Max size for global variable 65536 (64KiB) - Preferred total size of global vars 3294404608 (3.068GiB) + Preferred total size of global vars 3287644160 (3.062GiB) Global Memory cache type Read/Write Global Memory cache size 524288 Global Memory cache line 64 bytes Image support Yes Max number of samplers per kernel 16 - Max size for 1D images from buffer 205900288 pixels + Max size for 1D images from buffer 205477760 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image args 128 + Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 3287644160 (3.062GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 
1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_fp64 cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation cl_intel_va_api_media_sharing + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) Intel(R) OpenCL HD Graphics + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [INTEL] + clCreateContext(NULL, ...) [default] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + +ICD loader properties + ICD loader Name OpenCL ICD Loader + ICD loader Vendor OCL Icd free software + ICD loader Version 2.2.8 + ICD loader Profile OpenCL 1.2 + NOTE: your OpenCL library declares to support OpenCL 1.2, + but it seems to support up to OpenCL 2.1 too. 
+********************************************************************************/ +"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32", "2 5 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0_FP32", "7 4 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "10 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0_FP32", "7 3 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", 
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1_FP32", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP32", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0_FP32", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP32", "4 3 8 2 1 1 8 1 0", +"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", 
+"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "3 7 8 2 1 1 8 1 0", +"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0_FP32", "9 3 16 2 1 1 16 1 0", +"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP32", "4 4 16 2 1 1 16 1 0", +}; + +const char *default_kernel_config_intel_fp16[] = { +// Below is the information for OpenCL based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL HD Graphics + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.1 + Platform Profile FULL_PROFILE + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation + Platform Host timer resolution 1ns + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL HD Graphics +Number of devices 1 + Device Name Intel(R) Gen9 HD Graphics NEO + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.1 NEO + Driver Version 18.21.10858 + Device OpenCL C Version OpenCL C 2.0 + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 48 + Max clock frequency 950MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types None + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Max sub-groups per work group 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 13364170752 (12.45GiB) + Error Correction support No + Max memory allocation 4294959104 (4GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + 
Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 4294959104 (4GiB) + Global Memory cache type Read/Write + Global Memory cache size 1048576 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 268434944 pixels Max 1D or 2D image array size 2048 images Base address alignment for 2D image buffers 4 bytes Pitch alignment for 2D image buffers 4 bytes @@ -522,7 +861,7 @@ Number of devices 1 Max pipe packet size 1024 Local memory type Local Local memory size 65536 (64KiB) - Max constant buffer size 3294404608 (3.068GiB) + Max constant buffer size 4294959104 (4GiB) Max number of constant args 8 Max size of kernel argument 1024 Queue properties (on host) @@ -540,14 +879,16 @@ Number of devices 1 Execution capabilities Run OpenCL kernels Yes Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 SPIR versions 1.2 printf() buffer size 4194304 (4MiB) - Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; Motion Estimation accelerator version (Intel) 2 Device Available Yes Compiler Available Yes Linker Available Yes - Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation NULL platform behavior clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) 
No platform @@ -560,153 +901,282 @@ NULL platform behavior clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform ********************************************************************************/ -"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32_activ2_eltwise0","8 3 16 2 1 1 16 1 0 ", -"EU24_k2x2_cn16_g1_s2x2_d1x1_b1_in256x256_p0x0_num1_M16_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M16_activ2_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5_activ0_eltwise0","2 4 8 2 1 1 8 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32_activ2_eltwise0","10 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4_activ2_eltwise0","2 8 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", 
-"EU24_k2x2_cn64_g1_s2x2_d1x1_b1_in128x128_p0x0_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn4_g1_s1x1_d1x1_b1_in256x256_p0x0_num1_M16_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","8 2 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M16_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4_activ0_eltwise0","1 1 1 4 1 1 1 0 1 ", -"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16_activ2_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 6 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M4_activ2_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4_activ0_eltwise0","4 4 8 2 1 1 8 1 0 ", -"EU24_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64_activ0_eltwise0","4 1 
16 2 1 1 16 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M16_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32_activ1_eltwise0","4 7 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn16_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M64_activ3_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0","2 8 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn16_g1_s1x1_d1x1_b1_in256x256_p0x0_num1_M4_activ2_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M32_activ2_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn32_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M128_activ3_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn16_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M64_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", 
-"EU24_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU24_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2_activ0_eltwise0","1 3 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13_activ0_eltwise0","1 1 1 4 1 1 1 0 1 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","4 3 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", 
-"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1","4 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP16", "11 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP16", "8 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP16", "7 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "8 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "6 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP16", "1 16 32 5 1 16 1 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "4 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP16", "7 1 8 
2 1 1 8 1 0", +"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP16", "5 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP16", "5 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP16", "8 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP16", "10 2 16 2 1 1 16 1 0", +"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP16", "5 1 16 2 1 1 16 1 0", +"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP16", "5 6 16 2 1 1 16 1 0", +"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP16", "2 8 16 2 1 1 16 1 0", +"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0_FP16", "9 2 16 2 1 1 16 1 0", +// Below is the information for OpenCL based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL HD Graphics + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.1 + Platform Profile FULL_PROFILE + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_fp64 cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation cl_intel_va_api_media_sharing + Platform Host timer resolution 1ns + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL HD Graphics +Number of devices 1 + Device 
Name Intel(R) Gen9 HD Graphics NEO + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.1 NEO + Driver Version 18.23.10915 + Device OpenCL C Version OpenCL C 2.0 + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 24 + Max clock frequency 1150MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types None + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Max sub-groups per work group 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 6575288320 (6.124GiB) + Error Correction support No + Max memory allocation 3287644160 (3.062GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 3287644160 (3.062GiB) + Global Memory cache type Read/Write + Global Memory cache size 524288 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 205477760 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image args 128 + Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 3287644160 (3.062GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 + SPIR versions 1.2 + printf() buffer 
size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_fp64 cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation cl_intel_va_api_media_sharing + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) Intel(R) OpenCL HD Graphics + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [INTEL] + clCreateContext(NULL, ...) [default] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + +ICD loader properties + ICD loader Name OpenCL ICD Loader + ICD loader Vendor OCL Icd free software + ICD loader Version 2.2.8 + ICD loader Profile OpenCL 1.2 + NOTE: your OpenCL library declares to support OpenCL 1.2, + but it seems to support up to OpenCL 2.1 too. 
+********************************************************************************/ +"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP16", "2 7 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0_FP16", "7 4 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP16", "10 3 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP16", "10 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0_FP16", "8 4 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP16", "10 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "9 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0_FP16", "8 3 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0_FP16", "7 3 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", 
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "8 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "6 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP16", "10 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP16", "13 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", 
+"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP16", "4 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0_FP16", "7 3 16 2 1 1 16 1 0", +"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP16", "4 7 16 2 1 1 16 1 0", }; #endif diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index b5699b2..034f8d3 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -55,6 +55,7 @@ #include "../include/math_functions.hpp" #include "../include/default_kernel_config.hpp" #include "opencv2/dnn/shape_utils.hpp" +#include "opencv2/core/utils/logger.hpp" #if defined WIN32 || defined _WIN32 #include @@ -87,10 +88,13 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path) { CV_Assert(defaultConfigLoaded == false); CV_Assert(kernelConfigMap.empty()); - const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2; + + /* fp32 config */ + size_t numConfigs = sizeof(default_kernel_config_intel_fp32) / + sizeof(default_kernel_config_intel_fp32[0]) / 2; for (size_t i = 0; i < numConfigs; i++) { - std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i]; + std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i]; if (!cache_path.empty()) { std::string cacheFile = cache_path + sanitize(key); @@ -100,9 +104,29 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path) } std::pair entry( key, - default_kernel_config_intel[2 * i + 1]); + default_kernel_config_intel_fp32[2 * i + 1]); kernelConfigMap.insert(entry); } + + /* fp16 config */ + numConfigs = sizeof(default_kernel_config_intel_fp16) / + sizeof(default_kernel_config_intel_fp16[0]) / 2; + for (size_t i = 0; i < numConfigs; i++) + { + std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i]; + if (!cache_path.empty()) + { + std::string cacheFile = cache_path + sanitize(key); + std::ifstream cachedKernel(cacheFile.c_str()); + if (cachedKernel) + continue; // external configuration found, skip builtin + } + std::pair entry( + key, + default_kernel_config_intel_fp16[2 * i + 1]); + kernelConfigMap.insert(entry); + } + defaultConfigLoaded = true; } @@ -311,40 +335,38 @@ void OCL4DNNConvSpatial::setupKernelDetails(int32_t kernelType, // options options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_; + options_ << " -cl-mad-enable"; if (clOptionSupport("-cl-no-subgroup-ifp")) options_ << " -cl-no-subgroup-ifp "; // defs - int32_t output_width = output_w_; - int32_t output_height = output_h_; int32_t output_block_width = blockM; int32_t output_block_height = blockK; - const int32_t last_block_width = (output_width % output_block_width == 0) ? - output_block_width : output_width % output_block_width; - const int32_t last_block_height = (output_height % output_block_height == 0) ? 
- output_block_height : output_height % output_block_height; - int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4); - int tile_y = (output_block_height -1) * stride_h_ + kernel_h_ * dilation_h_; - int tile_y_stride = (4 * simd_size) / tile_x; - int invec_size = divUp(tile_y, tile_y_stride); + int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_; + int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_; + int invec_size = tile_y; addDef("SIMD_SIZE", simd_size); - addDef("filter_qualifier", "__global"); addDef("OUT_BLOCK_WIDTH", output_block_width); addDef("OUT_BLOCK_HEIGHT", output_block_height); - addDef("LAST_BLOCK_WIDTH", last_block_width); - addDef("LAST_BLOCK_HEIGHT", last_block_height); addDef("INPUT_DEPTH", channels_ / group_); addDef("TOTAL_INPUT_DEPTH_SIZE", channels_); addDef("TOTAL_OUTPUT_DEPTH", num_output_); addDef("NUM_FILTERS", M_); addDef("TILE_X", tile_x); addDef("TILE_Y", tile_y); - addDef("TILE_Y_STRIDE", tile_y_stride); addDef("INVEC_SIZE", invec_size); addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size)); addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height)); addDef("APPLY_BIAS", bias_term_); + addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8); + addDef("INPUT_PITCH", (width_ * height_)); + addDef("OUTPUT_PITCH", (output_w_ * output_h_)); + addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_)); + addDef("INPUT_WIDTH", width_); + addDef("INPUT_HEIGHT", height_); + addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size)); + setFusionDefine(fused_activ_, fused_eltwise_); src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc; @@ -567,13 +589,6 @@ void OCL4DNNConvSpatial::calculateBenchmark(const UMat &bottom, UMat &ver return; } -#define dbg -#ifdef dbg -#define dbgPrint(x) (x) -#else -#define dbgPrint(x) -#endif - // For large enough input size, we do not need to tune kernels for different // size. The reason is with large input size, there will be enough work items // to feed al the EUs. @@ -584,6 +599,7 @@ void OCL4DNNConvSpatial::calculateBenchmark(const UMat &bottom, UMat &ver template void OCL4DNNConvSpatial::generateKey() { + std::string precision = (use_half_) ? "FP16" : "FP32"; std::stringstream keyBuilder; // FIXME: to support fuse? 
keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_" @@ -597,7 +613,8 @@ void OCL4DNNConvSpatial::generateKey() << "num" << num_ << "_" << "M" << M_ << "_" << "activ" << fused_activ_ << "_" - << "eltwise" << fused_eltwise_; + << "eltwise" << fused_eltwise_ << "_" + << precision; key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str(); @@ -616,11 +633,6 @@ std::string OCL4DNNConvSpatial::generateSpecificKey(int32_t type, int32_t << "_" << blockHeight << "_" << blockDepth; - if (!use_half_) - keyBuilder << "_float"; - else - keyBuilder << "_half"; - return keyBuilder.str(); } @@ -1164,7 +1176,7 @@ float OCL4DNNConvSpatial::timedConvolve(const UMat &bottom, UMat &top, cv::ocl::Timer timer(queue); timer.start(); bool res = true;; - dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl); + CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName); tuned_ = true; int loop_cnt = 4; for (int i = 0; i < loop_cnt; i++) { @@ -1181,7 +1193,6 @@ float OCL4DNNConvSpatial::timedConvolve(const UMat &bottom, UMat &top, } float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt; - #ifdef dbg double out_w = output_w_; double out_h = output_h_; double out_z = M_; @@ -1189,16 +1200,8 @@ float OCL4DNNConvSpatial::timedConvolve(const UMat &bottom, UMat &top, double k_h = kernel_h_; double k_z = channels_; double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_; - std::cout << "\tEstimated Gflops:" << (totalFlops * 1e-9) - << std::endl; - std::cout << "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)) - << std::endl; - #if 0 - std::cout << "Estimated utilization: " << - ((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0 - << std::endl; - #endif - #endif + CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9)); + CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))); return elapsedTime; } @@ -1254,18 +1257,18 @@ bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) { - dbgPrint(printf("test verification failed @ image %d group %d" - "out_ch %d h %d w %d got %G expected %G\n", - n, g, out_ch, h, w, data[offset], verify_data[offset])); + CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g + << " out_ch " << out_ch << " h " << h << " w " << w + << " got " << data[offset] << " expected " << verify_data[offset]); verificationFail = 1; goto out; } else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) { - dbgPrint(printf("test verification failed @ image %d group %d" - "out_ch %d h %d w %d got %G expected %G\n", - n, g, out_ch, h, w, data[offset], verify_data[offset])); + CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g + << " out_ch " << out_ch << " h " << h << " w " << w + << " got " << data[offset] << " expected " << verify_data[offset]); verificationFail = 1; goto out; } @@ -1546,17 +1549,11 @@ void OCL4DNNConvSpatial::generate_idlf_tuneritems(std::vector< cv::Ptr (4 * simd_size)) - return; - - if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max) + int tile_x = alignSize(actual_tile_x, simd_size); + if (tile_x > simd_size) return; - int tile_y_stride = (4 * simd_size) / tile_x; - int invec_size = 
divUp(tile_y, tile_y_stride); - if (invec_size > 4) + if (blockM * blockK > block_size_max) return; tunerItems.push_back(makePtr(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size)); @@ -1599,11 +1596,7 @@ void OCL4DNNConvSpatial::generateTunerItems(std::vector< cv::Ptr 0; height--) { generate_idlf_tuneritems(tunerItems, width, height, simd_size); - if (tunerItems.size() >= 8 && height == 2) - break; } - if (tunerItems.size() >= 12 && width == 2) - break; } } } @@ -1690,35 +1683,31 @@ void OCL4DNNConvSpatial::setupConvolution(const UMat &bottom, if (kernelQueue[x]->tested == false) { bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop); if (verified == false) { - dbgPrint(std::cout << "Kernel " - << kernelQueue[x]->kernelName - << " failed verification" << std::endl); - dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: " - << kernelQueue[x]->workItem_output[0] << " " - << "kernelQueue[x]->workItem_output[1]: " - << kernelQueue[x]->workItem_output[1] << " " - << "kernelQueue[x]->workItem_output[2]: " - << kernelQueue[x]->workItem_output[2] << " " - << "kernelQueue[x]->kernelType: " - << kernelQueue[x]->kernelType << " " - << "kernelQueue[x]->global_work_size[0]: " - << kernelQueue[x]->global_work_size[0] << " " - << "kernelQueue[x]->global_work_size[1]: " - << kernelQueue[x]->global_work_size[1] << " " - << "kernelQueue[x]->global_work_size[2]: " - << kernelQueue[x]->global_work_size[2] << " " - << "kernelQueue[x]->local_work_size[0]: " - << kernelQueue[x]->local_work_size[0] << " " - << "kernelQueue[x]->local_work_size[1]: " - << kernelQueue[x]->local_work_size[1] << " " - << "kernelQueue[x]->local_work_size[2]: " - << kernelQueue[x]->local_work_size[2] << " " - << kernelQueue[x]->swizzle_weights << " " - << kernelQueue[x]->use_null_local << std::endl); + CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification"); + CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: " + << kernelQueue[x]->workItem_output[0] << " " + << "kernelQueue[x]->workItem_output[1]: " + << kernelQueue[x]->workItem_output[1] << " " + << "kernelQueue[x]->workItem_output[2]: " + << kernelQueue[x]->workItem_output[2] << " " + << "kernelQueue[x]->kernelType: " + << kernelQueue[x]->kernelType << " " + << "kernelQueue[x]->global_work_size[0]: " + << kernelQueue[x]->global_work_size[0] << " " + << "kernelQueue[x]->global_work_size[1]: " + << kernelQueue[x]->global_work_size[1] << " " + << "kernelQueue[x]->global_work_size[2]: " + << kernelQueue[x]->global_work_size[2] << " " + << "kernelQueue[x]->local_work_size[0]: " + << kernelQueue[x]->local_work_size[0] << " " + << "kernelQueue[x]->local_work_size[1]: " + << kernelQueue[x]->local_work_size[1] << " " + << "kernelQueue[x]->local_work_size[2]: " + << kernelQueue[x]->local_work_size[2] << " " + << kernelQueue[x]->swizzle_weights << " " + << kernelQueue[x]->use_null_local); } else { - dbgPrint(std::cout << "Kernel " - << kernelQueue[x]->kernelName - << " pass verification" << std::endl); + CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification"); } } #endif @@ -1747,19 +1736,28 @@ void OCL4DNNConvSpatial::setupConvolution(const UMat &bottom, break; } else { kernelQueue[fastestKernel]->tested = true; - dbgPrint(std::cout << "Kernel " << - kernelQueue[fastestKernel]->kernelName << - " failed verification" << std::endl); + CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName << + " failed verification"); failures++; } } } if (verification) { - 
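The reworked generate_idlf_tuneritems() keeps only candidates whose aligned input tile still fits a single sub-group read and whose output block stays within the register budget. A simplified host-side sketch of that filter follows; block_size_max, the sweep ranges, and the reading of blockM/blockK as the output block width/height are assumptions made for illustration.

    #include <cstdio>
    #include <vector>

    // Simplified stand-in for the tuner's kernel-config candidate.
    struct TunerItem { int blockW, blockH, simd; };

    // Mirrors cv::alignSize().
    static int alignSize(int sz, int n) { return (sz + n - 1) / n * n; }

    // Sketch of the candidate filter: reject blocks whose input tile, rounded up
    // to the SIMD width, no longer fits one sub-group row, or whose accumulator
    // footprint exceeds the budget.
    static void addIdlfCandidate(std::vector<TunerItem>& items,
                                 int blockW, int blockH, int simd,
                                 int kernel_w, int stride_w, int dilation_w,
                                 int block_size_max /* assumed budget */)
    {
        int actual_tile_x = (blockW - 1) * stride_w + kernel_w * dilation_w;
        int tile_x = alignSize(actual_tile_x, simd);
        if (tile_x > simd)
            return;                         // input row would not fit one sub-group read
        if (blockW * blockH > block_size_max)
            return;                         // too many accumulators per work-item
        items.push_back({blockW, blockH, simd});
    }

    int main()
    {
        std::vector<TunerItem> items;
        const int simds[] = {8, 16};
        // Sweep a small, illustrative search space (the real tuner sweeps wider ranges
        // and, after this patch, no longer stops early once a few candidates are found).
        for (int simd : simds)
            for (int w = 14; w > 0; w--)
                for (int h = 14; h > 0; h--)
                    addIdlfCandidate(items, w, h, simd, /*kernel_w=*/3,
                                     /*stride_w=*/1, /*dilation_w=*/1,
                                     /*block_size_max=*/48);
        printf("candidates: %zu\n", items.size());
        return 0;
    }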
dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName << - "> passed verification" << std::endl); - dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl); + CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName << + "> passed verification"); + CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime); + double out_w = output_w_; + double out_h = output_h_; + double out_z = M_; + double k_w = kernel_w_; + double k_h = kernel_h_; + double k_z = channels_; + float elapsedTime = kernelQueue[kernel_index_]->executionTime; + double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_; + CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9)); + CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))); } else { - dbgPrint(std::cout << "fallback to basic kernel" << std::endl); + CV_LOG_INFO(NULL, "fallback to basic kernel"); options_.str(""); options_.clear(); // clear contents and state flags createBasicKernel(1, 1, 1); kernel_index_ = kernelQueue.size() - 1; @@ -1827,7 +1825,7 @@ void OCL4DNNConvSpatial::prepareKernel(const UMat &bottom, UMat &top, if (loadCachedConfig()) // check in-memory cache return; - if (loadTunedConfig()) // check external storage + if (loadTunedConfig()) // check external storage return; UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1); diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 621ab6f..dc7b047 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -206,8 +206,6 @@ __kernel void ConvolveBasic( #elif defined KERNEL_IDLF -#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0) - // Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map. // Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image. 
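With the precision now baked into the cache key, prepareKernel() resolves a configuration in three stages: in-memory cache, tuned-config storage, then auto-tuning. A hedged sketch of that flow is below; the function bodies are placeholders, and only the lookup order and the precision-qualified key mirror the hunk above.

    #include <cstdio>
    #include <map>
    #include <string>

    // Placeholder config record; the real ocl4dnn config carries block sizes,
    // kernel type, build options, etc.
    struct KernelConfig { std::string opts; };

    // In-memory cache keyed by the convolution signature. Since the key now also
    // encodes the precision ("FP16"/"FP32"), half and float configurations no
    // longer overwrite each other.
    static std::map<std::string, KernelConfig> g_cache;

    static bool loadCachedConfig(const std::string& key, KernelConfig& cfg)
    {
        std::map<std::string, KernelConfig>::iterator it = g_cache.find(key);
        if (it == g_cache.end()) return false;
        cfg = it->second;
        return true;
    }

    static bool loadTunedConfig(const std::string&, KernelConfig&)
    {
        return false;   // stand-in for the on-disk / built-in tuned-config lookup
    }

    static KernelConfig autoTune(const std::string& key)
    {
        KernelConfig cfg;                // stand-in for the benchmarking loop
        cfg.opts = "-DTUNED";
        g_cache[key] = cfg;              // remember for the next prepareKernel() call
        return cfg;
    }

    static KernelConfig prepareKernel(bool use_half)
    {
        // Precision is part of the key, mirroring generateKey() above.
        std::string key = std::string("conv_shape_key") + (use_half ? "_FP16" : "_FP32");
        KernelConfig cfg;
        if (loadCachedConfig(key, cfg)) return cfg;   // 1. in-memory cache
        if (loadTunedConfig(key, cfg))  return cfg;   // 2. external/default storage
        return autoTune(key);                         // 3. benchmark candidate kernels
    }

    int main()
    {
        KernelConfig c = prepareKernel(/*use_half=*/false);
        printf("opts: %s\n", c.opts.c_str());
        return 0;
    }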
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH @@ -219,190 +217,123 @@ __kernel void convolve_simd( ELTWISE_DATA_ARG FUSED_ARG - __global Dtype* inputs_base, - filter_qualifier Dtype* weights_base, + __global Dtype* inputs, + __global Dtype* weights, BIAS_KERNEL_ARG - __global Dtype* outputs_base, + __global Dtype* outputs, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height) { - __global Dtype* outputs = outputs_base; - __global Dtype* inputs = inputs_base; - filter_qualifier Dtype* weights = weights_base; unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column - unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT;// or = Output Row - unsigned int fm = get_global_id(2);// fm = Feature Map = od = Output Depth + unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row + unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth unsigned int fmg = get_group_id(2); unsigned int lid = get_local_id(2); - Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT]; - - int in_addr; + Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f }; // find weights address of given neuron (lid is index) - unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid; + unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) * + INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid; - for(int i=0;i= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) { - if (curr_x < INPUT_PAD_W) { - in_buf.in_vec[reg].s0 = 0; - if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1); - else - in_buf.in_vec[reg].s1 = 0; - if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2); - else - in_buf.in_vec[reg].s2 = 0; - if (curr_x + 3 < input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3); - else - in_buf.in_vec[reg].s3 = 0; - } else { - VLOAD4(in_buf.in_vec[reg], inputs + in_offset); - if (curr_x + 1 >= input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s1 = 0; - if (curr_x + 2 >= input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s2 = 0; - if (curr_x + 3 >= input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s3 = 0; - } - } else { - in_buf.in_vec[reg] = 0; + if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H && + curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W)) + { + in_buf[reg] = 0; } - curr_y += TILE_Y_STRIDE; -#else - VLOAD4(in_buf.in_vec[reg], inputs + in_offset); #endif - } - in_offset += input_width * TILE_Y_STRIDE; - }); - in_addr += input_height * input_width; + curr_y += 1; + in_offset += INPUT_WIDTH; + } + + in_addr += INPUT_PITCH; + #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y = saved_y; #endif -#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 -#define WEIGHT_PREF 8 -#else -#define WEIGHT_PREF 1 -#endif - union { - Dtype w[WEIGHT_PREF]; -#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 - INT_TYPE8 ui8; -#endif - } weight_buf; + Dtype weight_buf[WEIGHT_PREF]; int w_idx=0; - unsigned int orig_weight_addr = weight_addr; -#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 - weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); - weight_addr += SIMD_SIZE * WEIGHT_PREF; -#else - weight_buf.w[0] = 
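The rewritten convolve_simd reads weights one scalar per lane instead of using SUB_GROUP_BLOCK_READ, which assumes the host stores them interleaved by SIMD lane: consecutive lanes hold consecutive filters, and each lane advances by SIMD_SIZE per kernel tap. A host-side sketch of that index arithmetic, with assumed sizes:

    #include <cstdio>

    int main()
    {
        const int SIMD_SIZE = 8;          // lanes per sub-group (one filter per lane)
        const int INPUT_DEPTH = 4;        // input channels per group
        const int KW = 3, KH = 3;         // kernel width/height
        const int FILTERS_IN_GROUP = 2;   // ALIGNED_NUM_FILTERS / SIMD_SIZE

        int fmg = 3;                      // get_group_id(2): which block of SIMD_SIZE filters
        int lid = 5;                      // get_local_id(2): lane == filter within the block

        // Base address of this lane's filter, as in the kernel:
        // (fmg % FILTERS_IN_GROUP) * INPUT_DEPTH * KW * KH * SIMD_SIZE + lid
        unsigned weight_addr =
            (fmg % FILTERS_IN_GROUP) * INPUT_DEPTH * KW * KH * SIMD_SIZE + lid;

        // Each tap of the filter lies SIMD_SIZE elements further on, so a plain
        // weights[weight_addr] per lane replaces the old block-read path.
        for (int tap = 0; tap < KW * KH * INPUT_DEPTH; tap++)
        {
            printf("tap %2d -> weights[%u]\n", tap, weight_addr);
            weight_addr += SIMD_SIZE;
        }
        return 0;
    }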
as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr])); - weight_addr += SIMD_SIZE * 1; -#endif + for (int i = 0; i < WEIGHT_PREF; i++) + { + weight_buf[i] = weights[weight_addr]; + weight_addr += SIMD_SIZE; + } -#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) +#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c)) int kr = 0; // kr = Kernel Row LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop. + { + int kc = 0; // kc = Kernel Column + LOOP(KERNEL_WIDTH, kc, { - int kc = 0; // kc = Kernel Column - LOOP(KERNEL_WIDTH, kc, - { - for(int br=0; br < OUT_BLOCK_HEIGHT; br++) { - for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) { - Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X); - out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); - } - } -#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF - // We assume KERNEL_W is equal to KERNEL_H here. - if ((w_idx + 1) % WEIGHT_PREF == 0 - #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0 - && ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)) - #endif - ) { - weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); - weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. + for (int br=0; br < OUT_BLOCK_HEIGHT; br++) + { + for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) + { + Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X); + out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); } - #if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0 - // need to do nothing - #else - else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))) - #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1 - weight_buf.w[0] = weights[weight_addr]; - #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2 - weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]); - #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4 - weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]); - #else - weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); - #endif - #endif -#endif - ++w_idx; - }); + } + weight_buf[w_idx % WEIGHT_PREF] = weights[weight_addr]; + weight_addr += SIMD_SIZE; + ++w_idx; }); - weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE; - - } - // dead code to work around possible compiler bug. 
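The accumulation loop now fetches inputs with intel_sub_group_shuffle: each lane owns one x-position of a tile row, and BLOCK_IN(row, col) broadcasts the value held by lane col. A host-side emulation is sketched below, where the tile is an ordinary 2D array so the shuffle becomes a plain index; single input channel and assumed sizes only.

    #include <cstdio>

    int main()
    {
        const int SIMD_SIZE = 8;                 // upper bound on TILE_X after this patch
        const int KW = 3, KH = 3;
        const int OUT_BW = 4, OUT_BH = 2;        // OUT_BLOCK_WIDTH / OUT_BLOCK_HEIGHT
        const int STRIDE_X = 1, STRIDE_Y = 1, DILATION_X = 1, DILATION_Y = 1;
        const int TILE_Y = (OUT_BH - 1) * STRIDE_Y + KH * DILATION_Y;

        float in_tile[TILE_Y][SIMD_SIZE];        // in_buf: one row per register, one column per lane
        float weights[KH][KW];                   // this lane's filter taps (weight_buf refills)
        float out[OUT_BH][OUT_BW] = {};          // per-work-item accumulators

        for (int y = 0; y < TILE_Y; y++)
            for (int x = 0; x < SIMD_SIZE; x++)
                in_tile[y][x] = 1.0f;            // dummy input
        for (int kr = 0; kr < KH; kr++)
            for (int kc = 0; kc < KW; kc++)
                weights[kr][kc] = 0.5f;          // dummy weights

        // Same index arithmetic as the kernel's LOOP(KERNEL_HEIGHT, ...) body.
        for (int kr = 0; kr < KH; kr++)
            for (int kc = 0; kc < KW; kc++)
                for (int br = 0; br < OUT_BH; br++)
                    for (int bc = 0; bc < OUT_BW; bc++)
                        out[br][bc] += weights[kr][kc] *
                            in_tile[br * STRIDE_Y + kr * DILATION_Y]
                                   [bc * STRIDE_X + kc * DILATION_X];

        printf("out[0][0] = %f\n", out[0][0]);   // 3*3*0.5 = 4.5 for the dummy data
        return 0;
    }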
- if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) { - outputs[0] = BLOCK_IN(fm % SIMD_SIZE); + }); + weight_addr -= WEIGHT_PREF * SIMD_SIZE; } + fm = fm % ALIGNED_NUM_FILTERS; - if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) { - unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height; - out_addr += or * output_width + oc; - // we need this address calculation for biases because we support views and batching +#if LEFT_FILTERS > 0 + if (fm < NUM_FILTERS) +#endif + { + unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH; + out_addr += or * output_width + oc; + // we need this address calculation for biases because we support views and batching #if APPLY_BIAS - Dtype bias = biases_base[fm]; + Dtype bias = biases_base[fm]; #else - Dtype bias = 0; + Dtype bias = 0; #endif - for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) { + + for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) + { if (r + or >= output_height) break; - for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) { + for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) + { if (c + oc >= output_width) break; - // this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer. + // this does a scattered write to SIMD_SIZE different feature maps, + // so that data within one map is contiguous, thus ready for input to next layer. ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm); - } } } -- 2.7.4
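For completeness, the epilogue's output addressing and border clipping can be reproduced on the host as follows; all sizes are assumed, and orow stands in for the kernel's variable named or (a reserved token in C++).

    #include <cstdio>

    int main()
    {
        const int TOTAL_OUTPUT_DEPTH = 64;
        const int OUTPUT_W = 7, OUTPUT_H = 7;    // output_width / output_height
        const int OUTPUT_PITCH = OUTPUT_W * OUTPUT_H;
        const int OUT_BW = 4, OUT_BH = 2;

        int num_in_batch = 0;                    // image index within the batch
        int fm = 5;                              // feature map handled by this lane
        int oc = 4, orow = 6;                    // top-left corner of this work-item's block

        // Same address calculation as the kernel:
        // (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH + or * output_width + oc
        unsigned out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH
                            + orow * OUTPUT_W + oc;

        for (int r = 0; r < OUT_BH; r++)
        {
            if (r + orow >= OUTPUT_H) break;     // same bottom-border clipping as the kernel
            for (int c = 0; c < OUT_BW; c++)
            {
                if (c + oc >= OUTPUT_W) break;   // right-border clipping
                printf("write outputs[%u]\n", out_addr + r * OUTPUT_W + c);
            }
        }
        return 0;
    }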