From: levytamar82 Date: Thu, 24 Jul 2014 20:40:21 +0000 (-0700) Subject: 32 Align Load bug X-Git-Tag: v1.4.0~1019^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=efdfdf578792b0a9ed93e3b9dc747e1bfe7f5c0c;p=platform%2Fupstream%2Flibvpx.git 32 Align Load bug In vp9_sub_pixel_avg_variance32xh_avx2, the sec parameter was also read with an aligned load (_mm256_load_si256); it is now read with an unaligned load (_mm256_loadu_si256). Change-Id: I4d4966e0291059ea4d705baed1503dc58444fcb7 --- diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c index 9aa4da9..a441cad 100644 --- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -333,7 +333,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, if (y_offset == 0) { for (i = 0; i < height ; i++) { LOAD_SRC_DST - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expend each byte to 2 bytes @@ -347,7 +347,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, for (i = 0; i < height ; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expend each byte to 2 bytes @@ -369,7 +369,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_NEXT_SRC(src_reg, src_stride) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; MERGE_WITH_SRC(src_reg, zero_reg) @@ -385,7 +385,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, for (i = 0; i < height ; i++) { LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expand each byte to 2 bytes @@ -409,7 +409,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, AVG_NEXT_SRC(src_reg, 1) // average between previous average to current average src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); sec+= sec_stride; // expand each byte to 2 bytes @@ -437,7 +437,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_WITH_SRC(src_avg, src_reg) FILTER_SRC(filter) src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); // expand each byte to 2 bytes MERGE_WITH_SRC(src_avg, zero_reg) @@ -459,7 +459,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); MERGE_WITH_SRC(src_reg, zero_reg) sec+= sec_stride; @@ -487,7 +487,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); // average between previous pack to the current src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); sec+= sec_stride; MERGE_WITH_SRC(src_pack, zero_reg) @@ -524,7 +524,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const 
uint8_t *src, // filter the source FILTER_SRC(yfilter) src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); MERGE_WITH_SRC(src_pack, zero_reg) src_pack = src_reg;