void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
{
+ Context *clCxt = src1.clCxt;
+ bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
+ if (!hasDouble && src1.depth() == CV_64F)
+ {
+ CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
+ return;
+ }
+
+ CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
- CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
- src1.rows == src2.rows && src2.rows == dst.rows);
- CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
- Context *clCxt = src1.clCxt;
int channels = dst.oclchannels();
int depth = dst.depth();
+ // Steps/offsets are passed to the kernel in *elements*, not bytes.
+ // Each matrix must be scaled by its OWN elemSize1() (they are the same
+ // type here by the assert above, but mixing them up is a latent bug).
+ int cols1 = src1.cols * channels;
+ int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1();
+ int src2step1 = src2.step1(), src2offset1 = src2.offset / src2.elemSize1();
+ int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
- int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4},
- {4, 0, 4, 4, 4, 4, 4},
- {4, 0, 4, 4, 4, 4, 4},
- {4, 0, 4, 4, 4, 4, 4}
- };
-
-
- size_t vector_length = vector_lengths[channels - 1][depth];
- int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
- int cols = divUp(dst.cols * channels + offset_cols, vector_length);
+ // T/WT/convertToT are injected into the kernel at build time; work is done
+ // in WT (double when supported, else float) and saturated back to T for
+ // integer depths.
+ const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+ std::string buildOptions = format("-D T=%s -D WT=%s -D convertToT=convert_%s%s",
+ typeMap[depth], hasDouble ? "double" : "float", typeMap[depth],
+ depth >= CV_32F ? "" : "_sat_rte");
size_t localThreads[3] = { 256, 1, 1 };
- size_t globalThreads[3] = { cols, dst.rows, 1};
+ size_t globalThreads[3] = { cols1, dst.rows, 1};
+
+ float alpha_f = static_cast<float>(alpha),
+ beta_f = static_cast<float>(beta),
+ gama_f = static_cast<float>(gama);
- int dst_step1 = dst.cols * dst.elemSize();
- int src1_step = (int) src1.step;
- int src2_step = (int) src2.step;
- int dst_step = (int) dst.step;
- float alpha_f = alpha, beta_f = beta, gama_f = gama;
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 ));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1));
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 ));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 ));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1));
- if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
- {
- args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
- args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
- args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
- }
- else
+ // Scalar size must match WT chosen in buildOptions above.
+ if (!hasDouble)
{
args.push_back( make_pair( sizeof(cl_float), (void *)&alpha_f ));
args.push_back( make_pair( sizeof(cl_float), (void *)&beta_f ));
args.push_back( make_pair( sizeof(cl_float), (void *)&gama_f ));
}
+ else
+ {
+ args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
+ args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
+ args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
+ }
- args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
- args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
- openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth);
+ openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads,
+ args, -1, -1, buildOptions.c_str());
}
static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString)
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
+
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
-typedef double F;
-#else
-typedef float F;
#endif
+
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
- __global uchar *src2, int src2_step,int src2_offset,
- F alpha,F beta,F gama,
- __global uchar *dst, int dst_step,int dst_offset,
- int rows, int cols,int dst_step1)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
-
- {
-
- x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- uchar4 src1_data ,src2_data;
-
- src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
- src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
- src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
- src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
-
- src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
- src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
- src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
- src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
-
- uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
- short4 tmp;
- tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
- tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
- tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
- tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
- uchar4 tmp_data = convert_uchar4_sat(tmp);
-
- dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
- dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
- dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
- dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
- *((__global uchar4 *)(dst + dst_index)) = dst_data;
- // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
- }
-
-}
-
-
-
-__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
- __global ushort *src2, int src2_step,int src2_offset,
- F alpha,F beta,F gama,
- __global ushort *dst, int dst_step,int dst_offset,
- int rows, int cols,int dst_step1)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
-
- {
-
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
- ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
-
- ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
- // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
- int4 tmp;
- tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
- tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
- tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
- tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
- ushort4 tmp_data = convert_ushort4_sat(tmp);
- dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
- dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
- dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
- dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
- *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
- }
-
-
-}
-
-
-__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset,
- __global short *src2, int src2_step,int src2_offset,
- F alpha,F beta,F gama,
- __global short *dst, int dst_step,int dst_offset,
- int rows, int cols,int dst_step1)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
-
- {
-
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
-
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
- short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
-
- if(src1_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
- short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
- // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
- int4 tmp;
- tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
- tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
- tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
- tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
- short4 tmp_data = convert_short4_sat(tmp);
- dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
- dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
- dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
- dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
- *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
- }
-
-}
-
-__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
- __global int *src2, int src2_step,int src2_offset,
- F alpha,F beta, F gama,
- __global int *dst, int dst_step,int dst_offset,
- int rows, int cols,int dst_step1)
+// Unified addWeighted kernel replacing the per-depth addWeighted_D* variants.
+// T (element type), WT (working type: double when DOUBLE_SUPPORT, else float)
+// and convertToT (convert_<T> with _sat_rte for integer depths) are injected
+// at build time via -D options by the host. Steps and offsets are given in
+// elements of T; cols1 = cols * channels, so one work-item handles exactly
+// one scalar element.
+__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1,
+ __global T * src2, int src2_step1, int src2_offset1,
+ __global T * dst, int dst_step1, int dst_offset1,
+ WT alpha, WT beta, WT gama,
+ int cols1, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if (x < cols && y < rows)
-
+ if (x < cols1 && y < rows)
{
+ // Per-matrix linear element index for this (x, y); offsets differ
+ // because each matrix may be a sub-matrix with its own origin.
+ int src1_index = mad24(y, src1_step1, x + src1_offset1);
+ int src2_index = mad24(y, src2_step1, x + src2_offset1);
+ int dst_index = mad24(y, dst_step1, x + dst_offset1);
- x = x << 2;
-
-#define bitOfInt (sizeof(int)== 4 ? 2: 3)
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> bitOfInt) & 3)
-
- int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
- int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
-
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
- int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
- int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
-
- if(src1_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
- int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
- // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
- float4 tmp;
- tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
- tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
- tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
- tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
- int4 tmp_data = convert_int4_sat(tmp);
-
- dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
- dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
- dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
- dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
- *((__global int4 *)((__global char *)dst + dst_index)) = dst_data;
+ // Compute in WT, then convert (saturating + round-to-nearest-even
+ // for integer T) back to the destination type.
+ dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama);
}
-
}
-
-
-__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
- __global float *src2, int src2_step,int src2_offset,
- F alpha,F beta, F gama,
- __global float *dst, int dst_step,int dst_offset,
- int rows, int cols,int dst_step1)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
-
- {
-
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
- float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
- if(src1_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
- // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-
- // float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
- float4 tmp_data;
- tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
- tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
- tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
- tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
- // float4 tmp_data = convert_float4(tmp);
-
- dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
- dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
- dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
- dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
- *((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
- }
-
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
- __global double *src2, int src2_step,int src2_offset,
- F alpha,F beta, F gama,
- __global double *dst, int dst_step,int dst_offset,
- int rows, int cols,int dst_step1)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
-
- {
-
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3) & 3)
-
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
-
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
- double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
- double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
- if(src1_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
- // double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
- double4 tmp_data;
- tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
- tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
- tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
- tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-
- dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
- dst_data.y = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.y : dst_data.y;
- dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z;
- dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? tmp_data.w : dst_data.w;
-
- *((__global double4 *)((__global char *)dst + dst_index)) = dst_data;
- }
-
-}
-#endif