In the variance calculations the differences are summed and the sum is
later squared. When the sum exceeds sqrt(2^31), the square overflows a
signed 32-bit integer and is treated as negative when it is shifted,
which gives incorrect results.
To fix this we cast the result of the multiplication to unsigned.
An alternative fix would be to shift the sum right by 4 before
multiplying, but that would reduce precision.
For 16x16 blocks the maximum absolute sum is 65280, while sqrt(2^31) is
46340 (and change).
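A minimal, standalone sketch of the failure and of the cast fix, using the
worst-case 16x16 sum above. It is an illustration only, not part of the
patch, and it assumes the two's-complement wraparound these targets show
on signed overflow:

    #include <stdio.h>

    int main(void) {
        /* Worst case for a 16x16 block: every difference is 255,
         * so sum = 16 * 16 * 255 = 65280 > sqrt(2^31) ~= 46340. */
        int sum = 65280;

        /* sum * sum = 4261478400 does not fit in a signed 32-bit int;
         * on these targets it wraps to a negative value, and the
         * arithmetic shift then propagates the sign bit. */
        int wrong = (sum * sum) >> 8;

        /* Casting the product to unsigned forces a logical shift and
         * recovers the intended value. */
        unsigned int right = ((unsigned int)(sum * sum)) >> 8;

        printf("signed   : %d\n", wrong);  /* negative, incorrect */
        printf("unsigned : %u\n", right);  /* 16646400, correct */
        return 0;
    }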
PPC change is untested.
Change-Id: I1bad27ea0720067def6d71a6da5f789508cec265
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
;vmov.32 r1, d1[0]
;mul r0, r0, r0
;str r1, [r12]
- ;sub r0, r1, r0, asr #8
+ ;sub r0, r1, r0, lsr #8
- ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should
- ;have sign-bit exension, which is vshr.s. Have to use s32 to make it right.
+ ; while sum is signed, sum * sum is always positive and must be treated as
+ ; unsigned to avoid propagating the sign bit.
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #7
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #7
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #7
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #7
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #6
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #6
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r6] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
add sp, sp, #528
vmov.32 r0, d0[0] ;return
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
add sp, sp, #256
vmov.32 r0, d0[0] ;return
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #6
- vsub.s32 d0, d1, d10
+ vshr.u32 d10, d10, #6
+ vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {r4-r5, pc}
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> DS
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
.endm
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> 8
+ srlwi r3, r3, \DS ;# (sum*sum) >> 8
subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
.endm
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, 4 ;# (sum*sum) >> 4
+ srlwi r3, r3, 4 ;# (sum*sum) >> 4
subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
epilogue
stw r4, 0(r9) ;# sse
mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> 8
+ srlwi r3, r3, \DS ;# (sum*sum) >> 8
subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
.endm
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 8));
+ return (var - ((unsigned int)(avg * avg) >> 8));
}
unsigned int vp8_variance8x16_c(
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
unsigned int vp8_variance16x8_c(
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 6));
+ return (var - ((unsigned int)(avg * avg) >> 6));
}
unsigned int vp8_variance4x4_c(
variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
*sse = var;
- return (var - ((avg * avg) >> 4));
+ return (var - ((unsigned int)(avg * avg) >> 4));
}
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
- return (var - ((avg * avg) >> 4));
+ return (var - ((unsigned int)(avg * avg) >> 4));
}
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
- return (var - ((avg * avg) >> 6));
+ return (var - ((unsigned int)(avg * avg) >> 6));
}
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
*sse = var;
- return (var - ((avg * avg) >> 8));
+ return (var - ((unsigned int)(avg * avg) >> 8));
}
unsigned int vp8_variance16x8_mmx(
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
avg = sum0 + sum1;
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
- return (var - ((avg * avg) >> 4));
+ return (var - ((unsigned int)(avg * avg) >> 4));
}
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
- return (var - ((avg * avg) >> 6));
+ return (var - ((unsigned int)(avg * avg) >> 6));
}
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
- return (var - ((avg * avg) >> 7));
+ return (var - ((unsigned int)(avg * avg) >> 7));
}
}
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_ssse3
}
*sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 7));
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
}