From: Jonathan Wright Date: Thu, 8 Jul 2021 22:27:54 +0000 (+0100) Subject: aarch64: Use memcpy to copy vector tables in vtbl[34] intrinsics X-Git-Tag: upstream/12.2.0~6209 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f2f04d8b9d1f5d4fc8c3a17c7fa5ac518574f2df;p=platform%2Fupstream%2Fgcc.git aarch64: Use memcpy to copy vector tables in vtbl[34] intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vtbl[34] Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. gcc/ChangeLog: 2021-07-08 Jonathan Wright * config/aarch64/arm_neon.h (vtbl3_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vtbl3_u8): Likewise. (vtbl3_p8): Likewise. (vtbl4_s8): Likewise. (vtbl4_u8): Likewise. (vtbl4_p8): Likewise. --- diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index a7b8449..0ec46ef 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9682,11 +9682,9 @@ vtbl3_s8 (int8x8x3_t __tab, int8x8_t __idx) int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_s8 (__tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_s8 (__tab.val[2], + vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return __builtin_aarch64_qtbl2v8qi (__o, __idx); } @@ -9697,11 +9695,9 @@ vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx) uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_u8 (__tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_u8 (__tab.val[2], + vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9712,11 +9708,9 @@ vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx) poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); - __temp.val[1] = vcombine_p8 (__tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __temp.val[1] = vcombine_p8 (__tab.val[2], + vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9728,10 +9722,7 @@ vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return __builtin_aarch64_qtbl2v8qi (__o, __idx); } @@ -9743,10 +9734,7 @@ vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return (uint8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); } @@ -9758,10 +9746,7 @@ vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx) __builtin_aarch64_simd_oi __o; __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); return(poly8x8_t)__builtin_aarch64_qtbl2v8qi (__o, (int8x8_t)__idx); }