in_arg {
name: "input_min"
description: <<END
-If range_given, this is the min of the range, otherwise this input
-will be ignored.
+If `range_given == True`, this specifies the minimum input value that needs to
+be represented, otherwise it is determined from the min value of the `input`
+tensor.
END
}
in_arg {
name: "input_max"
description: <<END
-If range_given, this is the max of the range, otherwise this input
-will be ignored.
+If `range_given == True`, this specifies the maximum input value that needs to
+be represented, otherwise it is determined from the max value of the `input`
+tensor.
END
}
attr {
name: "signed_input"
description: <<END
-If the quantization is signed or unsigned.
+Whether the quantization is signed or unsigned. (actually this parameter should
+have been called <b>`signed_output`</b>)
END
}
attr {
name: "range_given"
description: <<END
-If the range is given or should be computed from the tensor.
+Whether the range is given or should be determined from the `input` tensor.
END
}
summary: "Quantizes then dequantizes a tensor."
2. Dequantizing it back to floating point numbers for the following ops, most
likely matmul.
-There are different ways to quantize. This version does not use the full range
-of the output type, choosing to elide the lowest possible value for symmetry
-(e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
-quantization), so that 0.0 maps to 0.
+There are different ways to quantize. This version uses only scaling, so 0.0
+maps to 0.
-To perform this op, we first find the range of values in our tensor. The range
-we use is always centered on 0, so we find m such that
+From the specified 'num_bits' in the quantized output type, it determines
+minimum and maximum representable quantized values.
-1. m = max(abs(input_min), abs(input_max)) if range_given is true,
-2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
+e.g.
-Our input tensor range is then [-m, m].
+* [-128, 127] for signed, num_bits = 8, or
+* [0, 255] for unsigned, num_bits = 8.
-Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
-If signed_input is true, this is
+If range_given == False, the initial input_min, input_max will be determined
+automatically as the minimum and maximum values in the input tensor, otherwise
+the specified values of input_min, input_max are used.
- [min_fixed, max_fixed ] =
- [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
+Note: If the input_min, input_max are specified, they do not need to equal the
+actual minimum and maximum values in the tensor. e.g. in some cases it may be
+beneficial to specify these values such that the low probability extremes of the
+input distribution are clipped.
-Otherwise, if signed_input is false, the fixed-point range is
+This op determines the maximum scale_factor that would map the initial
+[input_min, input_max] range to a range that lies within the representable
+quantized range.
- [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
+It determines the scale from one of input_min and input_max, then updates the
+other one to maximize the representable range.
-From this we compute our scaling factor, s:
+e.g.
- s = (max_fixed - min_fixed) / (2 * m).
+* if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+ 5.0]: it would use a scale_factor of -128 / -10.0 = 12.8. In this case, it
+ would update input_max to be 127 / 12.8 = 9.921875
+* if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+ 10.0]: it would use a scale_factor of 127 / 10.0 = 12.7. In this case, it
+ would update input_min to be -128.0 / 12.7 = -10.07874
+* if the output is unsigned, input_min is forced to be 0, and only the
+ specified input_max is used.
-Now we can quantize and dequantize the elements of our tensor. An element e
-is transformed into e':
+After determining the scale_factor and updating the input range, it applies the
+following to each value in the 'input' tensor.
- e' = (e * s).round_to_nearest() / s.
+output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
-Note that we have a different number of buckets in the signed vs. unsigned
-cases. For example, if num_bits == 8, we get 254 buckets in the signed case
-vs. 255 in the unsigned case.
-
-For example, suppose num_bits = 8 and m = 1. Then
-
- [min_fixed, max_fixed] = [-127, 127], and
- s = (127 + 127) / 2 = 127.
-
-Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
-{-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
END
}
namespace tensorflow {
namespace functor {
+// TODO(pauldonnelly): 'signed_input' should really be called 'signed_output'.
+
template <typename Device, typename T>
struct QuantizeAndDequantizeOneScaleFunctor {
void operator()(const Device& d, typename TTypes<T>::ConstVec input,
d.memcpyDeviceToHost(&min_range, input_min.data(), sizeof(T));
d.memcpyDeviceToHost(&max_range, input_max.data(), sizeof(T));
- // Make sure the range is symmetric for signed quantization, or start from
- // 0 for unsigned quantization.
- max_range = std::max(std::abs(max_range), std::abs(min_range));
+ // Calculate the range for the simulated integer quantization:
+ // e.g. [-128,127] for signed = true, num_bits = 8,
+ // or [0, 255] for signed = false, num_bits = 8.
+ const int64 min_quantized = signed_input ? -(1ULL << (num_bits - 1)) : 0;
+ const int64 max_quantized = min_quantized + ((1ULL << num_bits) - 1);
- // If both min and max are 0, then the output should be just 0.
- if (max_range == 0) {
- out.device(d) = input.constant(T(0));
- return;
- }
+ // Determine the maximum scaling factor that would scale
+ // [min_range, max_range] to not exceed [min_quantized, max_quantized],
+ // while keeping 0 unchanged.
+ const T scale_from_min_side = (min_quantized * min_range > 0)
+ ? min_quantized / min_range
+ : std::numeric_limits<T>::max();
+ const T scale_from_max_side = (max_quantized * max_range > 0)
+ ? max_quantized / max_range
+ : std::numeric_limits<T>::max();
- if (signed_input) {
- min_range = -max_range;
+ // Note: Avoids changing the side of the range that determines scale.
+ T scale, inverse_scale;
+ if (scale_from_min_side < scale_from_max_side) {
+ scale = scale_from_min_side;
+ inverse_scale = min_range / min_quantized;
+ max_range = max_quantized * inverse_scale;
+ } else {
+ scale = scale_from_max_side;
+ inverse_scale = max_range / max_quantized;
+ min_range = min_quantized * inverse_scale;
+ }
- // If it is signed, we try to keep 0.0 being 0 and drop one bucket. For
- // example, if it is 8 bits, we have the range [-127, 127]. So for input
- // range of [-x, x], the scale should be 254/(2*x).
- T scale = static_cast<T>((uint64_t{1} << (num_bits - 1)) - 1) / max_range;
- T inverse_scale = T(1.0) / scale;
- if (range_given) {
- out.device(d) =
- ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
- scale +
- T(0.5))
- .floor() *
- inverse_scale +
- min_range;
- } else {
- // No need to compare with min and max as they are measured from the
- // tensor.
- out.device(d) =
- ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
- min_range;
- }
+ if (range_given) {
+ // Note: The clamping here is to avoid overflow in the quantized type.
+ // The semantics of the op does not guarantee to clamp to the specified
+ // min_range and max_range - because we may have changed either min_range
+ // or max_range.
+ out.device(d) =
+ ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) * scale +
+ T(0.5))
+ .floor() *
+ inverse_scale +
+ min_range;
} else {
- min_range = 0;
- // If it is unsigned and num_bits == 8, the range with 8 bits is [0, 255].
- // If the input range is [0, x], then the scale is x/255 instead of 254 as
- // in the case above.
- T scale = static_cast<T>((uint64_t{1} << num_bits) - 1) / max_range;
- T inverse_scale = 1.0 / scale;
- if (range_given) {
- out.device(d) =
- ((input.cwiseMin(max_range).cwiseMax(min_range)) * scale + T(0.5))
- .floor() *
- inverse_scale;
- } else {
- // No need to compare with min and max as they are measured from the
- // tensor.
- out.device(d) = (input * scale + T(0.5)).floor() * inverse_scale;
- }
+ // No need to clamp to min_range and max_range in this case as they were
+ // measured from the tensor.
+ out.device(d) =
+ ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
+ min_range;
}
}
};
AddInputFromArray<float>(TensorShape({}), {0.0}); // Min
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
- // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
+ // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
// Scale is: 1/128
- // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+ // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128}
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(
- &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+ test::FillValues<float>(&expected,
+ {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
AddInputFromArray<int32>(TensorShape({}), {8}); // num_bits
- // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
- // Scale is: 1/127
- // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+ // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
+ // Scale is: 1/128
+ // Then it is dequantized to {-1, -64.0/128, 0, 38.0/128, 102.0/128, 71.0/128}
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(
- &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+ test::FillValues<float>(&expected,
+ {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
AddInputFromArray<float>(TensorShape({}), {0.0}); // Min
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
- // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
- // Scale is: 1/7
+ // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
+ // Scale is: 1/8
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(&expected,
- {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+ test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
AddInputFromArray<int32>(TensorShape({}), {4}); // num_bits
- // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
- // Scale is: 1/7
+ // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
+ // Scale is: 1/8
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(&expected,
- {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+ test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
AddInputFromArray<float>(TensorShape({}), {1.0}); // Max
// Note that the range is given as [-1, 1].
- // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -127,
+ // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -128,
// 127}.
// Scale is: 1/127
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
- test::FillValues<float>(&expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127,
- 102.0 / 127, 70.0 / 127, -1, 1});
+ test::FillValues<float>(
+ &expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127,
+ 70.0 / 127, -128.0 / 127, 1});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
}
AddInputFromArray<int32>(TensorShape({}), {8}); // num_bits
// Note that the range is given as [-1, 1].
- // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -127,
+ // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -128,
// 127}.
// Scale is: 1/127
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
- test::FillValues<float>(&expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127,
- 102.0 / 127, 70.0 / 127, -1, 1});
+ test::FillValues<float>(
+ &expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127,
+ 70.0 / 127, -128.0 / 127, 1});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
}