if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expend each byte to 2 bytes
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expend each byte to 2 bytes
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_reg, zero_reg)
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
AVG_NEXT_SRC(src_reg, 1)
// average between previous average to current average
src_avg = _mm256_avg_epu8(src_avg, src_reg);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
MERGE_WITH_SRC(src_reg, zero_reg)
sec+= sec_stride;
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average between previous pack to the current
src_pack = _mm256_avg_epu8(src_pack, src_reg);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_pack, zero_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;