From f0ee72be8fad66c6916692378292b0a54b0de5a7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 18 May 2018 17:43:46 -0700
Subject: [PATCH] Make the quantize_and_dequantize op use the full quantized
 range when possible.

PiperOrigin-RevId: 197226707
---
 .../core/kernels/quantize_and_dequantize_op.h      | 75 ++++++++--------------
 .../kernels/quantize_and_dequantize_op_test.cc     | 32 +++++----
 2 files changed, 43 insertions(+), 64 deletions(-)

diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 3b09ea2..7ba41b4 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -49,56 +49,37 @@ struct QuantizeAndDequantizeOneScaleImpl {
     d.memcpyDeviceToHost(&min_range, input_min.data(), sizeof(T));
     d.memcpyDeviceToHost(&max_range, input_max.data(), sizeof(T));
 
-    // Make sure the range is symmetric for signed quantization, or start from
-    // 0 for unsigned quantization.
-    max_range = std::max(std::abs(max_range), std::abs(min_range));
+    // Calculate the range for the simulated integer quantization:
+    // e.g. [-128,127] for signed = true, num_bits = 8,
+    // or [0, 255] for signed = false, num_bits = 8.
+    const int64 min_quantized = signed_input ? -(1ULL << (num_bits - 1)) : 0;
+    const int64 max_quantized = min_quantized + ((1ULL << num_bits) - 1);
 
-    // If both min and max are 0, then the output should be just 0.
-    if (max_range == 0) {
-      out.device(d) = input.constant(T(0));
-      return;
-    }
-
-    if (signed_input) {
-      min_range = -max_range;
+    // Determine the maximum scaling factor that would scale
+    // [min_range, max_range] to not exceed [min_quantized, max_quantized],
+    // while keeping 0 unchanged.
+    const T scale_from_min_side = (min_quantized * min_range > 0)
+                                      ? min_quantized / min_range
+                                      : std::numeric_limits<T>::max();
+    const T scale_from_max_side = (max_quantized * max_range > 0)
+                                      ? max_quantized / max_range
+                                      : std::numeric_limits<T>::max();
+    auto scale = std::min(scale_from_min_side, scale_from_max_side);
 
-      // If it is signed, we try to keep 0.0 being 0 and drop one bucket. For
-      // example, if it is 8 bits, we have the range [-127, 127]. So for input
-      // range of [-x, x], the scale should be 254/(2*x).
-      T scale = static_cast<T>((uint64_t{1} << (num_bits - 1)) - 1) / max_range;
-      T inverse_scale = T(1.0) / scale;
-      if (range_given) {
-        out.device(d) =
-            ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
-                 scale +
-             T(0.5))
-                    .floor() *
-                inverse_scale +
-            min_range;
-      } else {
-        // No need to compare with min and max as they are measured from the
-        // tensor.
-        out.device(d) =
-            ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
-            min_range;
-      }
+    T inverse_scale = T(1.0) / scale;
+    if (range_given) {
+      out.device(d) =
+          ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) * scale +
+           T(0.5))
+              .floor() *
+              inverse_scale +
+          min_range;
     } else {
-      min_range = 0;
-      // If it is unsigned and num_bits == 8, the range with 8 bits is [0, 255].
-      // If the input range is [0, x], then the scale is 255/x instead of 254 as
-      // in the case above.
-      T scale = static_cast<T>((uint64_t{1} << num_bits) - 1) / max_range;
-      T inverse_scale = 1.0 / scale;
-      if (range_given) {
-        out.device(d) =
-            ((input.cwiseMin(max_range).cwiseMax(min_range)) * scale + T(0.5))
-                .floor() *
-                inverse_scale;
-      } else {
-        // No need to compare with min and max as they are measured from the
-        // tensor.
-        out.device(d) = (input * scale + T(0.5)).floor() * inverse_scale;
-      }
+      // No need to compare with min and max as they are measured from the
+      // tensor.
+      out.device(d) =
+          ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
+          min_range;
     }
   }
 };
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index e41df12..f9f20d0 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -105,13 +105,13 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
-  // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
-  // Scale is: 1/127
-  // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
+  // Scale is: 1/128
+  // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128}
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(
-      &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+  test::FillValues<float>(&expected,
+                          {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -136,13 +136,13 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8_V3) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
   AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
 
-  // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
-  // Scale is: 1/127
-  // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
+  // Scale is: 1/128
+  // Then it is dequantized to {-1, -64.0/128, 0, 38.0/128, 102.0/128, 71.0/128}
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(
-      &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+  test::FillValues<float>(&expected,
+                          {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -166,12 +166,11 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
-  // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
-  // Scale is: 1/7
+  // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
+  // Scale is: 1/8
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(&expected,
-                          {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+  test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -196,12 +196,11 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_V3) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
   AddInputFromArray<int32>(TensorShape({}), {4});    // num_bits
 
-  // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
-  // Scale is: 1/7
+  // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
+  // Scale is: 1/8
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(&expected,
-                          {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+  test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
-- 
2.7.4
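
Note: to sanity-check the arithmetic in the updated test comments, here is a
minimal standalone C++ sketch of the new scaling rule. It mirrors the patched
kernel (full quantized range, largest scale that keeps [min_range, max_range]
in bounds while mapping 0.0 to 0) but is illustrative only: the function name,
the use of plain float for T, and the input values {-1, -0.5, 0, 0.3, 0.8,
0.555} (inferred from the quantized values quoted in the test comments) are
assumptions, not part of the patch.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <initializer_list>
#include <iostream>
#include <limits>

// Illustrative sketch of the patched kernel's logic; not TensorFlow code.
float QuantizeAndDequantize(float x, float min_range, float max_range,
                            int num_bits, bool signed_input) {
  // Full simulated integer range, e.g. [-128, 127] for signed, num_bits = 8.
  const int64_t min_quantized = signed_input ? -(1LL << (num_bits - 1)) : 0;
  const int64_t max_quantized = min_quantized + ((1LL << num_bits) - 1);

  // Largest scale that keeps [min_range, max_range] inside
  // [min_quantized, max_quantized] while leaving 0.0 at 0.
  const float scale_from_min_side =
      (min_quantized * min_range > 0) ? min_quantized / min_range
                                      : std::numeric_limits<float>::max();
  const float scale_from_max_side =
      (max_quantized * max_range > 0) ? max_quantized / max_range
                                      : std::numeric_limits<float>::max();
  const float scale = std::min(scale_from_min_side, scale_from_max_side);
  const float inverse_scale = 1.0f / scale;

  // Clamp, quantize to the nearest bucket, then map back to float.
  x = std::min(std::max(x, min_range), max_range);
  return std::floor((x - min_range) * scale + 0.5f) * inverse_scale + min_range;
}

int main() {
  // With the range measured from the tensor, [min_range, max_range] is
  // [-1, 0.8]: scale_from_min_side = -128 / -1 = 128 and
  // scale_from_max_side = 127 / 0.8 = 158.75, so scale = 128, which is why
  // the updated comment reads "Scale is: 1/128" rather than 1/127.
  for (float x : {-1.0f, -0.5f, 0.0f, 0.3f, 0.8f, 0.555f}) {
    std::cout << QuantizeAndDequantize(x, -1.0f, 0.8f, 8, true) << "\n";
  }
  // Prints -1, -0.5, 0, 0.296875 (= 38/128), 0.796875 (= 102/128) and
  // 0.554688 (= 71/128), the expected values in the updated int8 test.
}

Taking the minimum of the two per-side scales is what lets the op use the full
range (e.g. [-128, 127] instead of the symmetric [-127, 127]) whenever one side
of the input range has slack, which is where the new quantized values
{-128, -64, 0, 38, 102, 71} come from.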