d.memcpyDeviceToHost(&min_range, input_min.data(), sizeof(T));
d.memcpyDeviceToHost(&max_range, input_max.data(), sizeof(T));
- // Calculate the range for the simulated integer quantization:
- // e.g. [-128,127] for signed = true, num_bits = 8,
- // or [0, 255] for signed = false, num_bits = 8.
- const int64 min_quantized = signed_input ? -(1ULL << (num_bits - 1)) : 0;
- const int64 max_quantized = min_quantized + ((1ULL << num_bits) - 1);
+ // Make sure the range is symmetric for signed quantization, or start from
+ // 0 for unsigned quantization.
+ max_range = std::max(std::abs(max_range), std::abs(min_range));
- // Determine the maximum scaling factor that would scale
- // [min_range, max_range] to not exceed [min_quantized, max_quantized],
- // while keeping 0 unchanged.
- const T scale_from_min_side = (min_quantized * min_range > 0)
- ? min_quantized / min_range
- : std::numeric_limits<T>::max();
- const T scale_from_max_side = (max_quantized * max_range > 0)
- ? max_quantized / max_range
- : std::numeric_limits<T>::max();
- auto scale = std::min(scale_from_min_side, scale_from_max_side);
+ // If both min and max are 0, then the output should be just 0.
+ if (max_range == 0) {
+ out.device(d) = input.constant(T(0));
+ return;
+ }
+
+ if (signed_input) {
+ min_range = -max_range;
- T inverse_scale = T(1.0) / scale;
- if (range_given) {
- out.device(d) =
- ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) * scale +
- T(0.5))
- .floor() *
- inverse_scale +
- min_range;
+ // If it is signed, we try to keep 0.0 being 0 and drop one bucket. For
+ // example, if it is 8 bits, we have the range [-127, 127]. So for input
+ // range of [-x, x], the scale should be 254/(2*x).
+ T scale = static_cast<T>((uint64_t{1} << (num_bits - 1)) - 1) / max_range;
+ T inverse_scale = T(1.0) / scale;
+ if (range_given) {
+ out.device(d) =
+ ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
+ scale +
+ T(0.5))
+ .floor() *
+ inverse_scale +
+ min_range;
+ } else {
+ // No need to compare with min and max as they are measured from the
+ // tensor.
+ out.device(d) =
+ ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
+ min_range;
+ }
} else {
- // No need to compare with min and max as they are measured from the
- // tensor.
- out.device(d) =
- ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
- min_range;
+ min_range = 0;
+    // If it is unsigned and num_bits == 8, the range with 8 bits is [0, 255].
+    // If the input range is [0, x], then the scale is 255/x instead of
+    // 254/(2*x) as in the signed case above.
+ T scale = static_cast<T>((uint64_t{1} << num_bits) - 1) / max_range;
+ T inverse_scale = 1.0 / scale;
+ if (range_given) {
+ out.device(d) =
+ ((input.cwiseMin(max_range).cwiseMax(min_range)) * scale + T(0.5))
+ .floor() *
+ inverse_scale;
+ } else {
+ // No need to compare with min and max as they are measured from the
+ // tensor.
+ out.device(d) = (input * scale + T(0.5)).floor() * inverse_scale;
+ }
}
}
};
AddInputFromArray<float>(TensorShape({}), {0.0}); // Min
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
- // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
+ // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
// Scale is: 1/127
- // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128}
+ // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(&expected,
- {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
+ test::FillValues<float>(
+ &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
AddInputFromArray<int32>(TensorShape({}), {8}); // num_bits
- // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
- // Scale is: 1/128
- // Then it is dequantized to {-1, -64.0/128, 0, 38.0/128, 102.0/128, 71.0/128}
+ // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
+ // Scale is: 1/127
+ // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(&expected,
- {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
+ test::FillValues<float>(
+ &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
AddInputFromArray<float>(TensorShape({}), {0.0}); // Min
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
- // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
- // Scale is: 1/8
+ // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
+ // Scale is: 1/7
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
+ test::FillValues<float>(&expected,
+ {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.
AddInputFromArray<float>(TensorShape({}), {0.0}); // Max
AddInputFromArray<int32>(TensorShape({}), {4}); // num_bits
- // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
- // Scale is: 1/8
+ // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
+ // Scale is: 1/7
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
- test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
+ test::FillValues<float>(&expected,
+ {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
// Ensure that the inputs haven't been changed.