//
//M*/
+//data (which is float) is aligned to 32 bytes
+#define WIDTH_MULTIPLE (32 >> 2)
+
/////////////////////////////////////////////////////////
//*******************************************************
// basicretinafilter
float4 result_v4 = (float4)(0), out_v4;
float result = 0;
- // we assume elements_per_row is multple of 4
- for(int i = 0; i < 4; ++ i, -- optr)
+ // we assume elements_per_row is a multiple of WIDTH_MULTIPLE
+ for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr)
{
- if(i < elements_per_row - cols)
- {
- *optr = result;
- }
- else
+ if(i >= elements_per_row - cols)
{
result = *optr + _a * result;
- *optr = result;
}
+ *optr = result;
}
result_v4.x = result;
optr -= 3;
- for(int i = 1; i < elements_per_row / 4; ++i, optr -= 4)
+ for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4)
{
// shift left, `offset` is type `size_t` so it cannot be negative
out_v4 = vload4(0, optr);
float4 buf_v4, out_v4, res_v4 = (float4)(0);
float result = 0;
- // we assume elements_per_row is multple of 4
- for(int i = 0; i < 4; ++ i, -- optr, -- bptr)
+ // we assume elements_per_row is a multiple of WIDTH_MULTIPLE
+ for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr, -- bptr)
{
- if(i < elements_per_row - cols)
- {
- *optr = result;
- }
- else
+ if(i >= elements_per_row - cols)
{
result = *optr + *bptr * result;
- *optr = result;
}
+ *optr = result;
}
res_v4.x = result;
optr -= 3;
bptr -= 3;
- for(int i = 0; i < elements_per_row / 4 - 1; ++i, optr -= 4, bptr -= 4)
+ for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4, bptr -= 4)
{
buf_v4 = vload4(0, bptr);
out_v4 = vload4(0, optr);