From 3a17101171d3e51fcba2189d09416c5106bfe4ac Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 23 May 2018 16:33:27 -0700
Subject: [PATCH] Make depthwiseconv handler handle filter ranges beyond 255

PiperOrigin-RevId: 197810361
---
 tensorflow/contrib/lite/kernels/conv.cc            |  1 +
 tensorflow/contrib/lite/kernels/depthwise_conv.cc  |  5 +-
 .../contrib/lite/kernels/depthwise_conv_test.cc    | 43 ++++++++++++--
 tensorflow/contrib/lite/kernels/fully_connected.cc |  1 +
 .../internal/optimized/depthwiseconv_uint8.h       | 67 +++++++++++++++-------
 .../optimized/depthwiseconv_uint8_3x3_filter.h     | 22 ++++---
 .../internal/reference/depthwiseconv_uint8.h       |  4 +-
 tensorflow/contrib/lite/kernels/kernel_util.cc     |  1 -
 8 files changed, 100 insertions(+), 44 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 0b35a22..ee42e5c 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -254,6 +254,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     double real_multiplier = 0.0;
     TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
         context, input, filter, bias, output, &real_multiplier));
+    TF_LITE_ENSURE(context, real_multiplier < 1.0);
     QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
                                      &data->output_shift);
     CalculateActivationRangeUint8(params->activation, output,
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
index abb2549..a308de0 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -151,8 +151,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     double real_multiplier = 0.0;
     TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
         context, input, filter, bias, output, &real_multiplier));
-    QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
-                                     &data->output_shift);
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
     CalculateActivationRangeUint8(params->activation, output,
                                   &data->output_activation_min,
                                   &data->output_activation_max);
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
index 1439c8b..c00cafb 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
@@ -47,12 +47,6 @@ class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
     }
 
     output_ = AddOutput(output);
-    if (input.type != TensorType_FLOAT32) {
-      // The following is required by quantized inference. It is the unittest's
-      // responsibility to make sure the output scale falls into the correct
-      // range.
-      CHECK_LT(GetScale(input_) * GetScale(filter_), GetScale(output_));
-    }
 
     int input_depth = GetShape(input_)[3];
     int output_depth = GetShape(filter_)[3];
@@ -176,6 +170,43 @@ TEST(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) {
                              }));
 }
 
+TEST(QuantizedDepthwiseConvolutionOpTest,
+     SimpleTestQuantizedFilterMultiplierGreaterThan1) {
+  QuantizedDepthwiseConvolutionOpModel quant_op(
+      {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64},
+      {TensorType_UINT8, {1, 2, 2, 4}, -128.5, 128},
+      {TensorType_UINT8, {}, -127, 128});
+  DepthwiseConvolutionOpModel float_op({TensorType_FLOAT32, {1, 3, 2, 2}},
+                                       {TensorType_FLOAT32, {1, 2, 2, 4}},
+                                       {TensorType_FLOAT32, {}});
+
+  std::initializer_list<float> input = {
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  };
+  std::initializer_list<float> filter = {
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  };
+  std::initializer_list<float> bias = {1, 2, 3, 4};
+
+  quant_op.SetInput(input);
+  quant_op.SetFilter(filter);
+  quant_op.SetBias(bias);
+  quant_op.Invoke();
+
+  float_op.SetInput(input);
+  float_op.SetFilter(filter);
+  float_op.SetBias(bias);
+  float_op.Invoke();
+
+  EXPECT_THAT(quant_op.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(float_op.GetOutput(), 1)));
+}
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index 1b942a1..9899206 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -117,6 +117,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     double real_multiplier = 0.0;
     TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
         context, input, filter, bias, output, &real_multiplier));
+    TF_LITE_ENSURE(context, real_multiplier < 1.0);
     QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
                                      &data->output_shift);
     CalculateActivationRangeUint8(params->activation, output,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index dd6932f..75cf987 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1691,14 +1691,16 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   const int filter_width = ArraySize(filter_dims, 1);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
+  const bool shift_left = (output_shift <= 0);
+  const int32 multiplier_power_of_two = shift_left ? (1 << -output_shift) : 1;
   TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
 
 #ifdef __aarch64__
   // Call kernel optimized for depthwise convolutions using 3x3 filters if
   // parameters are supported.
-  if (Fast3x3FilterKernelSupported(input_dims, filter_dims, stride_width,
-                                   stride_height, pad_width, pad_height,
-                                   depth_multiplier, output_dims)) {
+  if (Fast3x3FilterKernelSupported(
+          input_dims, filter_dims, stride_width, stride_height, pad_width,
+          pad_height, depth_multiplier, output_dims, output_shift)) {
     DepthwiseConv3x3Filter(input_data, input_dims, input_offset, filter_data,
                            filter_dims, filter_offset, bias_data, bias_dims,
                            stride_width, stride_height, pad_width, pad_height,
@@ -1833,12 +1835,20 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
           acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
         }
 
-        // Fixed-point multiplication.
-        for (int j = 0; j < 4; j++) {
-          acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
-        }
-        for (int j = 0; j < 4; j++) {
-          acc[j] = RoundingDivideByPOT(acc[j], output_shift);
+        if (!shift_left) {
+          // Fixed-point multiplication.
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
+          }
+          for (int j = 0; j < 4; j++) {
+            acc[j] = RoundingDivideByPOT(acc[j], output_shift);
+          }
+        } else {
+          // Fixed-point multiplication.
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
+            acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
+          }
         }
         // Add the output offset.
         for (int j = 0; j < 4; j++) {
@@ -1870,12 +1880,21 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
       for (; i <= num_output_values - 8; i += 8) {
         int32x4_t acc0 = vld1q_s32(acc_buffer + i);
         int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
-        // Fixed-point multiplication.
-        acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
-        acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
-        // Rounding right shift.
-        acc0 = RoundingDivideByPOT(acc0, output_shift);
-        acc1 = RoundingDivideByPOT(acc1, output_shift);
+        if (!shift_left) {
+          // Fixed-point multiplication.
+          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+          // Rounding right shift.
+          acc0 = RoundingDivideByPOT(acc0, output_shift);
+          acc1 = RoundingDivideByPOT(acc1, output_shift);
+        } else {
+          // Fixed-point multiplication.
+          acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
+          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+
+          acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
+          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+        }
         // Add the output offset.
         acc0 = vaddq_s32(acc0, output_offset_vec);
         acc1 = vaddq_s32(acc1, output_offset_vec);
@@ -1899,10 +1918,16 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
       // that will have to go through the very slow scalar code.
      for (; i <= num_output_values - 4; i += 4) {
         int32x4_t acc = vld1q_s32(acc_buffer + i);
-        // Fixed-point multiplication.
-        acc = vqrdmulhq_n_s32(acc, output_multiplier);
-        // Rounding right shift.
-        acc = RoundingDivideByPOT(acc, output_shift);
+        if (!shift_left) {
+          // Fixed-point multiplication.
+          acc = vqrdmulhq_n_s32(acc, output_multiplier);
+          // Rounding right shift.
+          acc = RoundingDivideByPOT(acc, output_shift);
+        } else {
+          // Fixed-point multiplication.
+          acc = vmulq_n_s32(acc, multiplier_power_of_two);
+          acc = vqrdmulhq_n_s32(acc, output_multiplier);
+        }
         // Add the output offset.
         acc = vaddq_s32(acc, output_offset_vec);
         // Apply the activation function.
@@ -1923,8 +1948,8 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
       // Handle leftover values, one by one. This is very slow.
       for (; i < num_output_values; i++) {
         int32 acc = acc_buffer[i];
-        acc = MultiplyByQuantizedMultiplierSmallerThanOne(
-            acc, output_multiplier, output_shift);
+        acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                            -output_shift);
         acc += output_offset;
         acc = std::max(acc, output_activation_min);
         acc = std::min(acc, output_activation_max);
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 55e0d5c..9b1a45e 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -4408,12 +4408,10 @@ struct ConvRow3x3FilterDepth8<8, 1, 1> {
   }
 };
 
-inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims,
-                                         const Dims<4>& filter_dims,
-                                         int stride_width, int stride_height,
-                                         int pad_width, int pad_height,
-                                         int depth_multiplier,
-                                         const Dims<4>& output_dims) {
+inline bool Fast3x3FilterKernelSupported(
+    const Dims<4>& input_dims, const Dims<4>& filter_dims, int stride_width,
+    int stride_height, int pad_width, int pad_height, int depth_multiplier,
+    const Dims<4>& output_dims, int output_shift) {
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int input_depth = ArraySize(input_dims, 0);
@@ -4422,12 +4420,12 @@ inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims,
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
 
-  bool supported = filter_width == 3 && filter_height == 3 &&
-                   depth_multiplier == 1 &&
-                   (stride_width == 1 || stride_width == 2) &&
-                   (stride_height == 1 || stride_height == 2) &&
-                   (stride_width == stride_height) && pad_width == 0 &&
-                   pad_height == 0 && (input_depth % 8) == 0;
+  bool supported =
+      filter_width == 3 && filter_height == 3 && depth_multiplier == 1 &&
+      (stride_width == 1 || stride_width == 2) &&
+      (stride_height == 1 || stride_height == 2) &&
+      (stride_width == stride_height) && pad_width == 0 && pad_height == 0 &&
+      (input_depth % 8) == 0 && (output_shift > 0);
 
   if (!supported) {
     return false;
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
index e9b6bae..d577392 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -76,8 +76,8 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
               if (bias_data) {
                 acc += bias_data[Offset(bias_dims, oc, 0, 0, 0)];
               }
-              acc = MultiplyByQuantizedMultiplierSmallerThanOne(
-                  acc, output_multiplier, output_shift);
+              acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                                  -output_shift);
               acc += output_offset;
               acc = std::max(acc, output_activation_min);
               acc = std::min(acc, output_activation_max);
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
index 239b533..1840284 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -37,7 +37,6 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
   TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <=
                               1e-6 * std::min(input_product_scale, bias_scale));
   TF_LITE_ENSURE(context, input_product_scale >= 0);
-  TF_LITE_ENSURE(context, input_product_scale < output_scale);
 
   *multiplier = input_product_scale / output_scale;
-- 
2.7.4
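
Note (editorial addition, not part of the upstream patch): the core of the change is that with a wide filter range, real_multiplier = input_scale * filter_scale / output_scale can reach or exceed 1.0. QuantizeMultiplierSmallerThanOne and MultiplyByQuantizedMultiplierSmallerThanOne encode the multiplier as a Q31 mantissa plus a right shift only, so they cannot represent such values; QuantizeMultiplier instead returns a signed exponent, and a positive exponent becomes a plain left shift applied before the rounding-doubling high multiply, which is what the vmulq_n_s32 + vqrdmulhq_n_s32 pairs in the NEON branches above do. The self-contained C++ sketch below models that arithmetic with hypothetical helper names (QuantizeMultiplierSketch, ApplyMultiplierSketch); it is a simplified illustration under those assumptions, not TFLite's actual code:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

// Decompose multiplier > 0 as q * 2^exponent, where q is a Q31 value in
// [2^30, 2^31) representing a mantissa in [0.5, 1).
void QuantizeMultiplierSketch(double multiplier, int32_t* q, int* exponent) {
  const double mantissa = std::frexp(multiplier, exponent);
  int64_t q64 = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
  if (q64 == (1ll << 31)) {  // Rounding can push the mantissa up to 1.0.
    q64 /= 2;
    ++*exponent;
  }
  *q = static_cast<int32_t>(q64);
}

// Scalar analogue of NEON vqrdmulhq_n_s32: rounded, doubling, high half of
// a 32x32-bit multiply, i.e. round((a * b * 2) / 2^32) with saturation.
int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) {
  const bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
  const int32_t high = static_cast<int32_t>((ab + nudge) >> 31);
  return overflow ? std::numeric_limits<int32_t>::max() : high;
}

// Rounding (to-nearest) arithmetic right shift by a non-negative exponent.
int32_t RoundingDivideByPOTSketch(int32_t x, int exponent) {
  const int32_t mask = (1ll << exponent) - 1;
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// Apply q * 2^exponent to an accumulator: left-shift first when the
// exponent is positive (multiplier >= 1), rounding right shift otherwise.
// This mirrors the shift_left / !shift_left branches in the kernel above.
int32_t ApplyMultiplierSketch(int32_t acc, int32_t q, int exponent) {
  if (exponent > 0) acc *= (1 << exponent);
  acc = SaturatingRoundingDoublingHighMul(acc, q);
  if (exponent < 0) acc = RoundingDivideByPOTSketch(acc, -exponent);
  return acc;
}

int main() {
  int32_t q;
  int exponent;
  // A multiplier >= 1 yields a positive exponent, which the old
  // right-shift-only path could not represent.
  QuantizeMultiplierSketch(1.75, &q, &exponent);
  std::printf("q=%d exponent=%d, 100 * 1.75 ~= %d\n", q, exponent,
              ApplyMultiplierSketch(100, q, exponent));  // prints 175
  return 0;
}

This also explains the (output_shift > 0) guard added to Fast3x3FilterKernelSupported: the 3x3 fast path implements only the right-shift case, so multipliers >= 1 fall back to the generic kernel, while conv.cc and fully_connected.cc keep the old path and now explicitly TF_LITE_ENSURE that their multiplier stays below 1.0.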