Summary:
Fixes https://github.com/pytorch/pytorch/issues/66119
Failure on ARM Neoverse N1 before this PR:
```
======================================================================
FAIL: test_bitwise_ops_cpu_int16 (__main__.TestBinaryUfuncsCPU)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/opt/pytorch/pytorch/torch/testing/_internal/common_device_type.py", line 373, in instantiated_test
result = test(self, **param_kwargs)
File "test_binary_ufuncs.py", line 315, in test_bitwise_ops
self.assertEqual(op(a, b), op(a_np, b_np))
File "/opt/pytorch/pytorch/torch/testing/_internal/common_utils.py", line 1633, in assertEqual
self.assertEqual(
File "/opt/pytorch/pytorch/torch/testing/_internal/common_utils.py", line 1611, in assertEqual
super().assertTrue(result, msg=self._get_assert_msg(msg, debug_msg=debug_msg))
AssertionError: False is not true : Tensors failed to compare as equal!Found 176 different element(s) (out of 225), with the greatest difference of 21850 (-21846 vs. 4) occuring at index (0, 2).
======================================================================
FAIL: test_bitwise_ops_cpu_int32 (__main__.TestBinaryUfuncsCPU)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/opt/pytorch/pytorch/torch/testing/_internal/common_device_type.py", line 373, in instantiated_test
result = test(self, **param_kwargs)
File "test_binary_ufuncs.py", line 315, in test_bitwise_ops
self.assertEqual(op(a, b), op(a_np, b_np))
File "/opt/pytorch/pytorch/torch/testing/_internal/common_utils.py", line 1633, in assertEqual
self.assertEqual(
File "/opt/pytorch/pytorch/torch/testing/_internal/common_utils.py", line 1611, in assertEqual
super().assertTrue(result, msg=self._get_assert_msg(msg, debug_msg=debug_msg))
AssertionError: False is not true : Tensors failed to compare as equal!Found 188 different element(s) (out of 225), with the greatest difference of
1335341061 (-
1335341056 vs. 5) occuring at index (14, 8).
----------------------------------------------------------------------
```
which passes now.
CC malfet ezyang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/66194
Reviewed By: dagitses, bdhirsh, ngimel
Differential Revision:
D31430274
Pulled By: malfet
fbshipit-source-id:
bcf1c9d584c02eff328dd5b1f7af064fac5942c9
(cherry picked from commit
0b0674121aeb7d8bbcccd0461d939b64879a1273)
Signed-off-by: Eli Uriegas <eliuriegas@fb.com>
Co-authored-by: pbialecki <pbialecki@nvidia.com>
inline operator T*() {
return values;
}
+ // Return the values as char* for type punning
+ auto as_bytes() const -> const char* {
+ return reinterpret_cast<const char*>(values);
+ }
template <int64_t mask_>
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
int64_t mask = mask_;
#else
+template <typename T>
+auto load(char const* data) -> T {
+ T ret;
+ std::memcpy(&ret, data, sizeof(ret));
+ return ret;
+}
+
template<class T, typename Op>
static inline Vectorized<T> bitwise_binary_op(const Vectorized<T> &a, const Vectorized<T> &b, Op op) {
static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t);
__at_align__ intmax_t buffer[element_no];
- const intmax_t *a_ptr = reinterpret_cast<const intmax_t*>((const T*) a);
- const intmax_t *b_ptr = reinterpret_cast<const intmax_t*>((const T*) b);
- for (uint32_t i = 0U; i < element_no; ++ i) {
- buffer[i] = op(a_ptr[i], b_ptr[i]);
- }
+ static_assert(VECTOR_WIDTH % sizeof(intmax_t) == 0, "VECTOR_WIDTH not a multiple of sizeof(intmax_t)");
+ static_assert(sizeof(buffer) == sizeof(Vectorized<T>), "sizeof(buffer) must match sizeof(Vectorized<T>)");
+ // We should be using memcpy in order to respect the strict aliasing rule
+ // see: https://github.com/pytorch/pytorch/issues/66119
+ // Using char* is defined in the C11 standard 6.5 Expression paragraph 7
+ // (http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf)
+ const auto* a_data = a.as_bytes();
+ const auto* b_data = b.as_bytes();
+ // load each intmax_t chunk and process; increase pointers by sizeof(intmax_t)
+ for (auto& out : buffer) {
+ out = op(load<intmax_t>(a_data), load<intmax_t>(b_data));
+ a_data += sizeof(intmax_t);
+ b_data += sizeof(intmax_t);
+ }
+ assert(a_data == a.as_bytes() + sizeof(a));
+ assert(b_data == b.as_bytes() + sizeof(b));
return Vectorized<T>::loadu(buffer);
}