From 5fdafc2bc51f7b9fd05041af2585bfd762a233bd Mon Sep 17 00:00:00 2001
From: Hermet Park
Date: Fri, 3 Sep 2021 11:07:30 +0900
Subject: [PATCH] Revert "sw_engine avx: the not aligned memory separated in the rasterRGBA32 func"

This reverts commit dafc229a757b71feb3f67c8bf4f8073f662d6964.
---
 src/lib/sw_engine/tvgSwRasterAvx.h | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/src/lib/sw_engine/tvgSwRasterAvx.h b/src/lib/sw_engine/tvgSwRasterAvx.h
index 32a271d..6e70868 100644
--- a/src/lib/sw_engine/tvgSwRasterAvx.h
+++ b/src/lib/sw_engine/tvgSwRasterAvx.h
@@ -25,7 +25,6 @@
 #include <immintrin.h>
 
 #define N_32BITS_IN_128REG 4
-#define N_32BITS_IN_256REG 8
 
 static inline __m128i ALPHA_BLEND(__m128i c, __m128i a)
 {
@@ -64,31 +63,24 @@ static inline __m128i ALPHA_BLEND(__m128i c, __m128i a)
 
 static inline void avxRasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
 {
-    //1. set the beginning of the array
-    dst += offset;
-
-    //2. fill the not aligned memory (for a 256-bit register a 32-bytes alignment is required)
-    uint32_t notAligned = ((uintptr_t)dst & 0x1f) / 4;
-    if (notAligned) {
-        notAligned = (N_32BITS_IN_256REG - notAligned > (uint32_t)len ? (uint32_t)len : N_32BITS_IN_256REG - notAligned);
-        for (uint32_t x = 0; x < notAligned; ++x) *dst++ = val;
-    }
+    //1. calculate how many iterations we need to cover the length
+    uint32_t iterations = len / 8;
+    uint32_t avxFilled = iterations * 8;
 
-    //3. calculate how many iterations we need to cover the length
-    uint32_t iterations = (len - notAligned) / N_32BITS_IN_256REG;
-    uint32_t avxFilled = iterations * N_32BITS_IN_256REG;
+    //2. set the beginning of the array
+    dst += offset;
+    __m256i_u* avxDst = (__m256i_u*) dst;
 
-    //4. fill the octets
-    __m256i avxVal = _mm256_set1_epi32(val);
-    auto avxDst = (__m256i*)dst;
+    //3. fill the octets
     for (uint32_t i = 0; i < iterations; ++i) {
-        *avxDst = avxVal;
+        *avxDst = _mm256_set1_epi32(val);
         avxDst++;
     }
 
-    //4. fill leftovers (in the first step set the pointer to the place where the avx job is done)
-    int32_t leftovers = len - notAligned - avxFilled;
+    //4. fill leftovers (in the first step we have to set the pointer to the place where the avx job is done)
+    int32_t leftovers = len - avxFilled;
     dst += avxFilled;
+
     while (leftovers--) *dst++ = val;
 }
 
-- 
2.7.4