+++ /dev/null
-__kernel void
-CopyBuffer(
- __global float* src,
- __global float* dst )
-{
- int id = (int)get_global_id(0);
- dst[id] = src[id];
-}
-
+++ /dev/null
-GEN6 := $(patsubst %.cl,%.gen6.bin,$(wildcard *.cl))
-GEN7 := $(patsubst %.cl,%.gen7.bin,$(wildcard *.cl))
-GEN75 := $(patsubst %.cl,%.gen75.bin,$(wildcard *.cl))
-
-%.gen6.bin: %.cl
- wine ../bin/gen6/TC_Tester.exe -f $< -dev gen6 -in cl_text -out dev_bin
-
-%.gen7.bin: %.cl
- wine ../bin/TC_Tester.exe -f $< -dev gen7 -in cl_text -out dev_bin
-
-%.gen75.bin: %.cl
- wine ../bin/TC_Tester.exe -f $< -dev gen75 -in cl_text -out dev_bin
-
-clean75:
- rm -rf ./gen75/*bin
-
-clean7:
- rm -rf ./gen7/*bin
-
-clean6:
- rm -rf ./gen6/*bin
-
-gen6: $(GEN6)
- mv *bin ./gen6
- rm -rf *ghal3d *ll *krn
-
-gen7: $(GEN7) clean7
- mv *bin ./gen7
- rm -rf *ghal3d *ll *krn *krn7 *_0.cl
-
-gen75: $(GEN75) clean75
- mv *bin ./gen75
- rm -rf *ghal3d *ll *krn *krn75 *_0.cl
-
+++ /dev/null
-/**
- *
- */
-
-unsigned char
-galoisMultiplication(unsigned char a, unsigned char b)
-{
- unsigned char p = 0;
- for(unsigned int i=0; i < 8; ++i)
- {
- if((b&1) == 1)
- {
- p^=a;
- }
- unsigned char hiBitSet = (a & 0x80);
- a <<= 1;
- if(hiBitSet == 0x80)
- {
- a ^= 0x1b;
- }
- b >>= 1;
- }
- return p;
-}
-
-inline uchar4
-sbox(__global uchar * SBox, uchar4 block)
-{
- return (uchar4)(SBox[block.x], SBox[block.y], SBox[block.z], SBox[block.w]);
-}
-
-uchar4
-mixColumns(__local uchar4 * block, __private uchar4 * galiosCoeff, unsigned int j)
-{
- unsigned int bw = 4;
-
- uchar x, y, z, w;
-
- x = galoisMultiplication(block[0].x, galiosCoeff[(bw-j)%bw].x);
- y = galoisMultiplication(block[0].y, galiosCoeff[(bw-j)%bw].x);
- z = galoisMultiplication(block[0].z, galiosCoeff[(bw-j)%bw].x);
- w = galoisMultiplication(block[0].w, galiosCoeff[(bw-j)%bw].x);
-
- for(unsigned int k=1; k< 4; ++k)
- {
- x ^= galoisMultiplication(block[k].x, galiosCoeff[(k+bw-j)%bw].x);
- y ^= galoisMultiplication(block[k].y, galiosCoeff[(k+bw-j)%bw].x);
- z ^= galoisMultiplication(block[k].z, galiosCoeff[(k+bw-j)%bw].x);
- w ^= galoisMultiplication(block[k].w, galiosCoeff[(k+bw-j)%bw].x);
- }
-
- return (uchar4)(x, y, z, w);
-}
-
-uchar4
-shiftRows(uchar4 row, unsigned int j)
-{
- uchar4 r = row;
- for(uint i=0; i < j; ++i)
- {
- r = r.yzwx;
- }
- return r;
-}
-
-__kernel
-void AESEncrypt(__global uchar4 * output ,
- __global uchar4 * input ,
- __global uchar4 * roundKey,
- __global uchar * SBox ,
- __local uchar4 * block0 ,
- __local uchar4 * block1 ,
- const uint width ,
- const uint rounds )
-
-{
- unsigned int blockIdx = get_group_id(0);
- unsigned int blockIdy = get_group_id(1);
-
- unsigned int localIdx = get_local_id(0);
- unsigned int localIdy = get_local_id(1);
-
- unsigned int globalIndex = (((blockIdy * width/4) + blockIdx) * 4 )+ (localIdy);
- unsigned int localIndex = localIdy;
-
- __private uchar4 galiosCoeff[4];
- galiosCoeff[0] = (uchar4)(2, 0, 0, 0);
- galiosCoeff[1] = (uchar4)(3, 0, 0, 0);
- galiosCoeff[2] = (uchar4)(1, 0, 0, 0);
- galiosCoeff[3] = (uchar4)(1, 0, 0, 0);
-
- block0[localIndex] = input[globalIndex];
-
- block0[localIndex] ^= roundKey[localIndex];
-
- for(unsigned int r=1; r < rounds; ++r)
- {
- block0[localIndex] = sbox(SBox, block0[localIndex]);
-
- block0[localIndex] = shiftRows(block0[localIndex], localIndex);
-
- barrier(CLK_LOCAL_MEM_FENCE);
- block1[localIndex] = mixColumns(block0, galiosCoeff, localIndex);
-
- barrier(CLK_LOCAL_MEM_FENCE);
- block0[localIndex] = block1[localIndex]^roundKey[r*4 + localIndex];
- }
- block0[localIndex] = sbox(SBox, block0[localIndex]);
-
- block0[localIndex] = shiftRows(block0[localIndex], localIndex);
-
- output[globalIndex] = block0[localIndex]^roundKey[(rounds)*4 + localIndex];
-}
-
-uchar4
-shiftRowsInv(uchar4 row, unsigned int j)
-{
- uchar4 r = row;
- for(uint i=0; i < j; ++i)
- {
- r = r.wxyz;
- }
- return r;
-}
-
-__kernel
-void AESDecrypt(__global uchar4 * output ,
- __global uchar4 * input ,
- __global uchar4 * roundKey ,
- __global uchar * SBox ,
- __local uchar4 * block0 ,
- __local uchar4 * block1 ,
- const uint width ,
- const uint rounds )
-
-{
- unsigned int blockIdx = get_group_id(0);
- unsigned int blockIdy = get_group_id(1);
-
- unsigned int localIdx = get_local_id(0);
- unsigned int localIdy = get_local_id(1);
-
- unsigned int globalIndex = (((blockIdy * width/4) + blockIdx) * 4 )+ (localIdy);
- unsigned int localIndex = localIdy;
-
- __private uchar4 galiosCoeff[4];
- galiosCoeff[0] = (uchar4)(14, 0, 0, 0);
- galiosCoeff[1] = (uchar4)(11, 0, 0, 0);
- galiosCoeff[2] = (uchar4)(13, 0, 0, 0);
- galiosCoeff[3] = (uchar4)(9, 0, 0, 0);
-
- block0[localIndex] = input[globalIndex];
-
- block0[localIndex] ^= roundKey[4*rounds + localIndex];
-
- for(unsigned int r=rounds -1 ; r > 0; --r)
- {
- block0[localIndex] = shiftRowsInv(block0[localIndex], localIndex);
-
- block0[localIndex] = sbox(SBox, block0[localIndex]);
-
- barrier(CLK_LOCAL_MEM_FENCE);
- block1[localIndex] = block0[localIndex]^roundKey[r*4 + localIndex];
-
- barrier(CLK_LOCAL_MEM_FENCE);
- block0[localIndex] = mixColumns(block1, galiosCoeff, localIndex);
- }
-
- block0[localIndex] = shiftRowsInv(block0[localIndex], localIndex);
-
- block0[localIndex] = sbox(SBox, block0[localIndex]);
-
- output[globalIndex] = block0[localIndex]^roundKey[localIndex];
-}
+++ /dev/null
-/*
- * For a description of the algorithm and the terms used, please see the
- * documentation for this sample.
- *
- * Each group invocation of this kernel, calculates the option value for a
- * given stoke price, option strike price, time to expiration date, risk
- * free interest and volatility factor.
- *
- * Multiple groups calculate the same with different input values.
- * Number of work-items in each group is same as number of leaf nodes
- * So, the maximum number of steps is limited by loca memory size available
- *
- * Each work-item calculate the leaf-node value and update local memory.
- * These leaf nodes are further reduced to the root of the tree (time step 0).
- */
-
-#define RISKFREE 0.02f
-#define VOLATILITY 0.30f
-
-__kernel
-void
-binomial_options(
- int numSteps,
- const __global float4* randArray,
- __global float4* output,
- __local float4* callA,
- __local float4* callB)
-{
- // load shared mem
- unsigned int tid = get_local_id(0);
- unsigned int bid = get_group_id(0);
-
- float4 inRand = randArray[bid];
-
- float4 s = (1.0f - inRand) * 5.0f + inRand * 30.f;
- float4 x = (1.0f - inRand) * 1.0f + inRand * 100.f;
- float4 optionYears = (1.0f - inRand) * 0.25f + inRand * 10.f;
- float4 dt = optionYears * (1.0f / (float)numSteps);
- float4 vsdt = VOLATILITY * sqrt(dt);
- float4 rdt = RISKFREE * dt;
- float4 r = exp(rdt);
- float4 rInv = 1.0f / r;
- float4 u = exp(vsdt);
- float4 d = 1.0f / u;
- float4 pu = (r - d)/(u - d);
- float4 pd = 1.0f - pu;
- float4 puByr = pu * rInv;
- float4 pdByr = pd * rInv;
-
- float4 profit = s * exp(vsdt * (2.0f * tid - (float)numSteps)) - x;
- callA[tid].x = profit.x > 0 ? profit.x : 0.0f;
- callA[tid].y = profit.y > 0 ? profit.y : 0.0f;
- callA[tid].z = profit.z > 0 ? profit.z: 0.0f;
- callA[tid].w = profit.w > 0 ? profit.w: 0.0f;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- for(int j = numSteps; j > 0; j -= 2)
- {
- if(tid < j)
- {
- callB[tid] = puByr * callA[tid] + pdByr * callA[tid + 1];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(tid < j - 1)
- {
- callA[tid] = puByr * callB[tid] + pdByr * callB[tid + 1];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- // write result for this block to global mem
- if(tid == 0) output[bid] = callA[0];
-}
+++ /dev/null
-/*
- * For a description of the algorithm and the terms used, please see the
- * documentation for this sample.
- *
- * One invocation of this kernel, i.e one work thread writes two output values.
- * Since every pass of this algorithm does width/2 comparisons, each compare
- * operation is done by one work thread.
- *
- * Depending of the direction of sort for the work thread, the output values
- * are written either as greater value to left element or lesser value to the
- * left element. Right element and left element are the two elements we are
- * comparing and "left" is the element with a smaller index into the array.
- *
- * if direction is CL_TRUE, i.e evaluates to non zero, it means "increasing".
- *
- * For an explanation of the terms "blockWidth", "sameDirectionBlockWidth",
- * stage, pass, pairDistance please go through the document shipped with this
- * sample.
- *
- * Since an explanation of the terms and the code here would be quite lengthy,
- * confusing and will greatly reduce the readability of this kernel, the code
- * has been explained in detail in the document mentioned above.
- */
-
-__kernel
-void bitonicSort(__global uint * theArray,
- const uint stage,
- const uint passOfStage,
- const uint width,
- const uint direction)
-{
- uint sortIncreasing = direction;
- uint threadId = get_global_id(0);
-
- uint pairDistance = 1 << (stage - passOfStage);
- uint blockWidth = 2 * pairDistance;
-
- uint leftId = (threadId % pairDistance)
- + (threadId / pairDistance) * blockWidth;
-
- uint rightId = leftId + pairDistance;
-
- uint leftElement = theArray[leftId];
- uint rightElement = theArray[rightId];
-
- uint sameDirectionBlockWidth = 1 << stage;
-
- if((threadId/sameDirectionBlockWidth) % 2 == 1)
- sortIncreasing = 1 - sortIncreasing;
-
- uint greater;
- uint lesser;
- if(leftElement > rightElement)
- {
- greater = leftElement;
- lesser = rightElement;
- }
- else
- {
- greater = rightElement;
- lesser = leftElement;
- }
-
- if(sortIncreasing)
- {
- theArray[leftId] = lesser;
- theArray[rightId] = greater;
- }
- else
- {
- theArray[leftId] = greater;
- theArray[rightId] = lesser;
- }
-}
+++ /dev/null
-/*
- * For a description of the algorithm and the terms used, please see the
- * documentation for this sample.
- *
- * On invocation of kernel blackScholes, each work thread calculates call price
- * and put price values for given stoke price, option strike price,
- * time to expiration date, risk free interest and volatility factor.
- */
-
-#define S_LOWER_LIMIT 10.0f
-#define S_UPPER_LIMIT 100.0f
-#define K_LOWER_LIMIT 10.0f
-#define K_UPPER_LIMIT 100.0f
-#define T_LOWER_LIMIT 1.0f
-#define T_UPPER_LIMIT 10.0f
-#define R_LOWER_LIMIT 0.01f
-#define R_UPPER_LIMIT 0.05f
-#define SIGMA_LOWER_LIMIT 0.01f
-#define SIGMA_UPPER_LIMIT 0.10f
-
-/**
- * @brief Abromowitz Stegun approxmimation for PHI (Cumulative Normal Distribution Function)
- * @param X input value
- * @param phi pointer to store calculated CND of X
- */
-void phi(float4 X, float4* phi)
-{
- float4 y;
- float4 absX;
- float4 t;
- float4 result;
-
- const float4 c1 = (float4)0.319381530f;
- const float4 c2 = (float4)-0.356563782f;
- const float4 c3 = (float4)1.781477937f;
- const float4 c4 = (float4)-1.821255978f;
- const float4 c5 = (float4)1.330274429f;
-
- const float4 zero = (float4)0.0f;
- const float4 one = (float4)1.0f;
- const float4 two = (float4)2.0f;
- const float4 temp4 = (float4)0.2316419f;
-
- const float4 oneBySqrt2pi = (float4)0.398942280f;
-
- absX = fabs(X);
- t = one/(one + temp4 * absX);
-
- y = one - oneBySqrt2pi * exp(-X*X/two) * t
- * (c1 + t
- * (c2 + t
- * (c3 + t
- * (c4 + t * c5))));
-
- result = (X < zero)? (one - y) : y;
-
- *phi = result;
-}
-
-/*
- * @brief Calculates the call and put prices by using Black Scholes model
- * @param s Array of random values of current option price
- * @param sigma Array of random values sigma
- * @param k Array of random values strike price
- * @param t Array of random values of expiration time
- * @param r Array of random values of risk free interest rate
- * @param width Width of call price or put price array
- * @param call Array of calculated call price values
- * @param put Array of calculated put price values
- */
-__kernel
-void
-blackScholes(const __global float4 *randArray,
- int width,
- __global float4 *call,
- __global float4 *put)
-{
- float4 d1, d2;
- float4 phiD1, phiD2;
- float4 sigmaSqrtT;
- float4 KexpMinusRT;
-
- size_t xPos = get_global_id(0);
- size_t yPos = get_global_id(1);
- float4 two = (float4)2.0f;
- float4 inRand = randArray[yPos * width + xPos];
- float4 S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand);
- float4 K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand);
- float4 T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand);
- float4 R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand);
- float4 sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand);
-
-
- sigmaSqrtT = sigmaVal * sqrt(T);
-
- d1 = (log(S/K) + (R + sigmaVal * sigmaVal / two)* T)/ sigmaSqrtT;
- d2 = d1 - sigmaSqrtT;
-
- KexpMinusRT = K * exp(-R * T);
- phi(d1, &phiD1), phi(d2, &phiD2);
- call[yPos * width + xPos] = S * phiD1 - KexpMinusRT * phiD2;
- phi(-d1, &phiD1), phi(-d2, &phiD2);
- put[yPos * width + xPos] = KexpMinusRT * phiD2 - S * phiD1;
-}
-
+++ /dev/null
-#!/bin/bash
-clang -emit-llvm -O3 -ccc-host-triple ptx32 -c $1 -o $1.o
-llvm-dis $1.o
-rm $1.o
-mv $1.o.ll $1.ll
-
+++ /dev/null
-/**
- * Given the blockindices and localIndicies this
- * function calculate the global index
- * @param blockIdx index of the block horizontally
- * @param blockIdy index of the block vertically
- * @param localidx index of the element relative to the block horizontally
- * @param localIdy index of the element relative to the block vertically
- * @param blockWidth width of each blcok which is 8
- * @param globalWidth Width of the input matrix
- */
-uint
-getIdx(uint blockIdx, uint blockIdy, uint localIdx, uint localIdy, uint blockWidth, uint globalWidth)
-{
- uint globalIdx = blockIdx * blockWidth + localIdx;
- uint globalIdy = blockIdy * blockWidth + localIdy;
-
- return (globalIdy * globalWidth + globalIdx);
-}
-/**
- * Perform Discrete Cosine Transform for block of size 8x8
- * in the input matrix
- * @param output output of the DCT8x8 transform
- * @param input input array
- * @param dct8x8 8x8 consine function base used to calculate DCT8x8
- * @param inter local memory which stores intermediate result
- * @param width width of the input matrix
- * @param blockWidth width of each block, 8 here
- * @param inverse flag to perform inverse DCT
- */
-__kernel
-void DCT(__global float * output,
- __global float * input,
- __global float * dct8x8,
- __local float * inter,
- const uint width,
- const uint blockWidth,
- const uint inverse)
-
-{
- /* get global indices of the element */
- uint globalIdx = get_global_id(0);
- uint globalIdy = get_global_id(1);
-
- /* get indices of the block to which the element belongs to */
- uint groupIdx = get_group_id(0);
- uint groupIdy = get_group_id(1);
-
- /* get indices relative to the block */
- uint i = get_local_id(0);
- uint j = get_local_id(1);
-
- uint idx = globalIdy * width + globalIdx;
-
- /* initialise the accumulator */
- float acc = 0.0f;
-
- /* AT * X */
- for(uint k=0; k < blockWidth; k++)
- {
- uint index1 = (inverse)? i*blockWidth + k : k * blockWidth + i;
- uint index2 = getIdx(groupIdx, groupIdy, j, k, blockWidth, width);
- acc += dct8x8[index1] * input[index2];
- }
-
- inter[j*blockWidth + i] = acc;
- /*
- * Make sure all the values of inter that belong to a block
- * are calculated before proceeding further
- */
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* again initalising the accumulator */
- acc = 0.0f;
-
- /* (AT * X) * A */
- for(uint k=0; k < blockWidth; k++)
- {
- uint index1 = i* blockWidth + k;
- uint index2 = (inverse)? j*blockWidth + k : k* blockWidth + j;
- acc += inter[index1] * dct8x8[index2];
- }
-
- output[idx] = acc;
-}
-
+++ /dev/null
-; ModuleID = 'void.cl.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
-define ptx_kernel void @hop() nounwind readnone noinline {
-entry:
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void ()* @hop}
+++ /dev/null
-/*
- */
-
-__kernel
-void fastWalshTransform(__global float * tArray,
- __const int step
- )
-{
- unsigned int tid = get_global_id(0);
-
- const unsigned int group = tid%step;
- const unsigned int pair = 2*step*(tid/step) + group;
-
- const unsigned int match = pair + step;
-
- float T1 = tArray[pair];
- float T2 = tArray[match];
-
- tArray[pair] = T1 + T2;
- tArray[match] = T1 - T2;
-}
+++ /dev/null
-
-// This is 2 PI / 1024
-#define ANGLE 0x1.921fb6p-8F
-
-// Return sin and cos of -2*pi*i/1024
-__attribute__((always_inline)) float
-k_sincos(int i, float *cretp)
-{
- if (i > 512)
- i -= 1024;
-
- float x = i * -ANGLE;
- *cretp = cos(x);
- return sin(x);
-}
-
-__attribute__((always_inline)) float4
-k_sincos4(int4 i, float4 *cretp)
-{
- i -= (i > 512) & 1024;
- float4 x = convert_float4(i) * -ANGLE;
- *cretp = cos(x);
- return sin(x);
-}
-
-// Twiddle factor stuff
-#define TWGEN(I,C,S) \
- float C; \
- float S = k_sincos(tbase * I, &C)
-
-#define TW4GEN(I,C,S) \
- float4 C; \
- float4 S = k_sincos4(tbase * I, &C)
-
-#define TWAPPLY(ZR, ZI, C, S) \
- do { \
- float4 __r = C * ZR - S * ZI; \
- ZI = C * ZI + S * ZR; \
- ZR = __r; \
- } while (0)
-
-# define TW4IDDLE4() \
- do { \
- TW4GEN(1, c1, s1); \
- TWAPPLY(zr1, zi1, c1, s1); \
- TW4GEN(2, c2, s2); \
- TWAPPLY(zr2, zi2, c2, s2); \
- TW4GEN(3, c3, s3); \
- TWAPPLY(zr3, zi3, c3, s3); \
- } while (0)
-
-# define TWIDDLE4() \
- do { \
- TWGEN(1, c1, s1); \
- TWAPPLY(zr1, zi1, c1, s1); \
- TWGEN(2, c2, s2); \
- TWAPPLY(zr2, zi2, c2, s2); \
- TWGEN(3, c3, s3); \
- TWAPPLY(zr3, zi3, c3, s3); \
- } while (0)
-
-// 4 point FFT
-#define FFT4() \
- do { \
- float4 ar0 = zr0 + zr2; \
- float4 ar2 = zr1 + zr3; \
- float4 br0 = ar0 + ar2; \
- float4 br1 = zr0 - zr2; \
- float4 br2 = ar0 - ar2; \
- float4 br3 = zr1 - zr3; \
- float4 ai0 = zi0 + zi2; \
- float4 ai2 = zi1 + zi3; \
- float4 bi0 = ai0 + ai2; \
- float4 bi1 = zi0 - zi2; \
- float4 bi2 = ai0 - ai2; \
- float4 bi3 = zi1 - zi3; \
- zr0 = br0; \
- zi0 = bi0; \
- zr1 = br1 + bi3; \
- zi1 = bi1 - br3; \
- zr3 = br1 - bi3; \
- zi3 = br3 + bi1; \
- zr2 = br2; \
- zi2 = bi2; \
- } while (0)
-
-// First pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass1(uint me,
- const __global float *gr, const __global float *gi,
- __local float *lds)
-{
- const __global float4 *gp;
- __local float *lp;
-
- // Pull in transform data
- gp = (const __global float4 *)(gr + (me << 2));
- float4 zr0 = gp[0*64];
- float4 zr1 = gp[1*64];
- float4 zr2 = gp[2*64];
- float4 zr3 = gp[3*64];
-
- gp = (const __global float4 *)(gi + (me << 2));
- float4 zi0 = gp[0*64];
- float4 zi1 = gp[1*64];
- float4 zi2 = gp[2*64];
- float4 zi3 = gp[3*64];
-
- FFT4();
-
- int4 tbase = (int)(me << 2) + (int4)(0, 1, 2, 3);
- TW4IDDLE4();
-
- // Save registers
- // Note that this pointer is not aligned enough to be cast to a float4*
- lp = lds + ((me << 2) + (me >> 3));
-
- lp[0] = zr0.x;
- lp[1] = zr0.y;
- lp[2] = zr0.z;
- lp[3] = zr0.w;
- lp += 66*4;
-
- lp[0] = zr1.x;
- lp[1] = zr1.y;
- lp[2] = zr1.z;
- lp[3] = zr1.w;
- lp += 66*4;
-
- lp[0] = zr2.x;
- lp[1] = zr2.y;
- lp[2] = zr2.z;
- lp[3] = zr2.w;
- lp += 66*4;
-
- lp[0] = zr3.x;
- lp[1] = zr3.y;
- lp[2] = zr3.z;
- lp[3] = zr3.w;
- lp += 66*4;
-
- // Imaginary part
- lp[0] = zi0.x;
- lp[1] = zi0.y;
- lp[2] = zi0.z;
- lp[3] = zi0.w;
- lp += 66*4;
-
- lp[0] = zi1.x;
- lp[1] = zi1.y;
- lp[2] = zi1.z;
- lp[3] = zi1.w;
- lp += 66*4;
-
- lp[0] = zi2.x;
- lp[1] = zi2.y;
- lp[2] = zi2.z;
- lp[3] = zi2.w;
- lp += 66*4;
-
- lp[0] = zi3.x;
- lp[1] = zi3.y;
- lp[2] = zi3.z;
- lp[3] = zi3.w;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-// Second pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass2(uint me, __local float *lds)
-{
- __local float *lp;
-
- // Load registers
- lp = lds + (me + (me >> 5));
-
- float4 zr0, zr1, zr2, zr3;
-
- zr0.x = lp[0*66];
- zr1.x = lp[1*66];
- zr2.x = lp[2*66];
- zr3.x = lp[3*66];
- lp += 66*4;
-
- zr0.y = lp[0*66];
- zr1.y = lp[1*66];
- zr2.y = lp[2*66];
- zr3.y = lp[3*66];
- lp += 66*4;
-
- zr0.z = lp[0*66];
- zr1.z = lp[1*66];
- zr2.z = lp[2*66];
- zr3.z = lp[3*66];
- lp += 66*4;
-
- zr0.w = lp[0*66];
- zr1.w = lp[1*66];
- zr2.w = lp[2*66];
- zr3.w = lp[3*66];
- lp += 66*4;
-
- float4 zi0, zi1, zi2, zi3;
-
- zi0.x = lp[0*66];
- zi1.x = lp[1*66];
- zi2.x = lp[2*66];
- zi3.x = lp[3*66];
- lp += 66*4;
-
- zi0.y = lp[0*66];
- zi1.y = lp[1*66];
- zi2.y = lp[2*66];
- zi3.y = lp[3*66];
- lp += 66*4;
-
- zi0.z = lp[0*66];
- zi1.z = lp[1*66];
- zi2.z = lp[2*66];
- zi3.z = lp[3*66];
- lp += 66*4;
-
- zi0.w = lp[0*66];
- zi1.w = lp[1*66];
- zi2.w = lp[2*66];
- zi3.w = lp[3*66];
-
- // Transform and twiddle
- FFT4();
-
- int tbase = (int)(me << 2);
- TWIDDLE4();
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Store registers
- lp = lds + ((me << 2) + (me >> 3));
-
- lp[0] = zr0.x;
- lp[1] = zr1.x;
- lp[2] = zr2.x;
- lp[3] = zr3.x;
- lp += 66*4;
-
- lp[0] = zr0.y;
- lp[1] = zr1.y;
- lp[2] = zr2.y;
- lp[3] = zr3.y;
- lp += 66*4;
-
- lp[0] = zr0.z;
- lp[1] = zr1.z;
- lp[2] = zr2.z;
- lp[3] = zr3.z;
- lp += 66*4;
-
- lp[0] = zr0.w;
- lp[1] = zr1.w;
- lp[2] = zr2.w;
- lp[3] = zr3.w;
- lp += 66*4;
-
- // Imaginary part
- lp[0] = zi0.x;
- lp[1] = zi1.x;
- lp[2] = zi2.x;
- lp[3] = zi3.x;
- lp += 66*4;
-
- lp[0] = zi0.y;
- lp[1] = zi1.y;
- lp[2] = zi2.y;
- lp[3] = zi3.y;
- lp += 66*4;
-
- lp[0] = zi0.z;
- lp[1] = zi1.z;
- lp[2] = zi2.z;
- lp[3] = zi3.z;
- lp += 66*4;
-
- lp[0] = zi0.w;
- lp[1] = zi1.w;
- lp[2] = zi2.w;
- lp[3] = zi3.w;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-// Third pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass3(uint me, __local float *lds)
-{
- __local float *lp;
-
- // Load registers
- lp = lds + (me + (me >> 5));
-
- float4 zr0, zr1, zr2, zr3;
-
- zr0.x = lp[0*66];
- zr1.x = lp[1*66];
- zr2.x = lp[2*66];
- zr3.x = lp[3*66];
- lp += 66*4;
-
- zr0.y = lp[0*66];
- zr1.y = lp[1*66];
- zr2.y = lp[2*66];
- zr3.y = lp[3*66];
- lp += 66*4;
-
- zr0.z = lp[0*66];
- zr1.z = lp[1*66];
- zr2.z = lp[2*66];
- zr3.z = lp[3*66];
- lp += 66*4;
-
- zr0.w = lp[0*66];
- zr1.w = lp[1*66];
- zr2.w = lp[2*66];
- zr3.w = lp[3*66];
- lp += 66*4;
-
- float4 zi0, zi1, zi2, zi3;
-
- zi0.x = lp[0*66];
- zi1.x = lp[1*66];
- zi2.x = lp[2*66];
- zi3.x = lp[3*66];
- lp += 66*4;
-
- zi0.y = lp[0*66];
- zi1.y = lp[1*66];
- zi2.y = lp[2*66];
- zi3.y = lp[3*66];
- lp += 66*4;
-
- zi0.z = lp[0*66];
- zi1.z = lp[1*66];
- zi2.z = lp[2*66];
- zi3.z = lp[3*66];
- lp += 66*4;
-
- zi0.w = lp[0*66];
- zi1.w = lp[1*66];
- zi2.w = lp[2*66];
- zi3.w = lp[3*66];
-
- // Transform and twiddle
- FFT4();
-
- int tbase = (int)((me >> 2) << 4);
- TWIDDLE4();
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Save registers
- lp = lds + me;
-
- lp[0*66] = zr0.x;
- lp[1*66] = zr0.y;
- lp[2*66] = zr0.z;
- lp[3*66] = zr0.w;
- lp += 66*4;
-
- lp[0*66] = zr1.x;
- lp[1*66] = zr1.y;
- lp[2*66] = zr1.z;
- lp[3*66] = zr1.w;
- lp += 66*4;
-
- lp[0*66] = zr2.x;
- lp[1*66] = zr2.y;
- lp[2*66] = zr2.z;
- lp[3*66] = zr2.w;
- lp += 66*4;
-
- lp[0*66] = zr3.x;
- lp[1*66] = zr3.y;
- lp[2*66] = zr3.z;
- lp[3*66] = zr3.w;
- lp += 66*4;
-
- // Imaginary part
- lp[0*66] = zi0.x;
- lp[1*66] = zi0.y;
- lp[2*66] = zi0.z;
- lp[3*66] = zi0.w;
- lp += 66*4;
-
- lp[0*66] = zi1.x;
- lp[1*66] = zi1.y;
- lp[2*66] = zi1.z;
- lp[3*66] = zi1.w;
- lp += 66*4;
-
- lp[0*66] = zi2.x;
- lp[1*66] = zi2.y;
- lp[2*66] = zi2.z;
- lp[3*66] = zi2.w;
- lp += 66*4;
-
- lp[0*66] = zi3.x;
- lp[1*66] = zi3.y;
- lp[2*66] = zi3.z;
- lp[3*66] = zi3.w;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-// Fourth pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass4(uint me, __local float *lds)
-{
- __local float *lp;
-
- // Load registers
- lp = lds + ((me & 0x3) + ((me >> 2) & 0x3)*(66*4) + ((me >> 4) << 2));
-
- float4 zr0, zr1, zr2, zr3;
-
- zr0.x = lp[0*66];
- zr0.y = lp[1*66];
- zr0.z = lp[2*66];
- zr0.w = lp[3*66];
- lp += 16;
-
- zr1.x = lp[0*66];
- zr1.y = lp[1*66];
- zr1.z = lp[2*66];
- zr1.w = lp[3*66];
- lp += 16;
-
- zr2.x = lp[0*66];
- zr2.y = lp[1*66];
- zr2.z = lp[2*66];
- zr2.w = lp[3*66];
- lp += 16;
-
- zr3.x = lp[0*66];
- zr3.y = lp[1*66];
- zr3.z = lp[2*66];
- zr3.w = lp[3*66];
- lp += 66*4*4 - 3*16;
-
- float4 zi0, zi1, zi2, zi3;
-
- zi0.x = lp[0*66];
- zi0.y = lp[1*66];
- zi0.z = lp[2*66];
- zi0.w = lp[3*66];
- lp += 16;
-
- zi1.x = lp[0*66];
- zi1.y = lp[1*66];
- zi1.z = lp[2*66];
- zi1.w = lp[3*66];
- lp += 16;
-
- zi2.x = lp[0*66];
- zi2.y = lp[1*66];
- zi2.z = lp[2*66];
- zi2.w = lp[3*66];
- lp += 16;
-
- zi3.x = lp[0*66];
- zi3.y = lp[1*66];
- zi3.z = lp[2*66];
- zi3.w = lp[3*66];
-
- // Transform and twiddle
- FFT4();
-
- int tbase = (int)((me >> 4) << 6);
- TWIDDLE4();
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Save registers in conflict free manner
- lp = lds + me;
-
- lp[0*68] = zr0.x;
- lp[1*68] = zr0.y;
- lp[2*68] = zr0.z;
- lp[3*68] = zr0.w;
- lp += 68*4;
-
- lp[0*68] = zr1.x;
- lp[1*68] = zr1.y;
- lp[2*68] = zr1.z;
- lp[3*68] = zr1.w;
- lp += 68*4;
-
- lp[0*68] = zr2.x;
- lp[1*68] = zr2.y;
- lp[2*68] = zr2.z;
- lp[3*68] = zr2.w;
- lp += 68*4;
-
- lp[0*68] = zr3.x;
- lp[1*68] = zr3.y;
- lp[2*68] = zr3.z;
- lp[3*68] = zr3.w;
- lp += 68*4;
-
- // Imaginary part
- lp[0*68] = zi0.x;
- lp[1*68] = zi0.y;
- lp[2*68] = zi0.z;
- lp[3*68] = zi0.w;
- lp += 68*4;
-
- lp[0*68] = zi1.x;
- lp[1*68] = zi1.y;
- lp[2*68] = zi1.z;
- lp[3*68] = zi1.w;
- lp += 68*4;
-
- lp[0*68] = zi2.x;
- lp[1*68] = zi2.y;
- lp[2*68] = zi2.z;
- lp[3*68] = zi2.w;
- lp += 68*4;
-
- lp[0*68] = zi3.x;
- lp[1*68] = zi3.y;
- lp[2*68] = zi3.z;
- lp[3*68] = zi3.w;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-// Fifth and last pass of 1K FFT
-__attribute__((always_inline)) void
-kfft_pass5(uint me,
- const __local float *lds,
- __global float *gr, __global float *gi)
-{
- const __local float *lp;
-
- // Load registers
- lp = lds + ((me & 0xf) + (me >> 4)*(68*4));
-
- float4 zr0, zr1, zr2, zr3;
-
- zr0.x = lp[0*68];
- zr0.y = lp[1*68];
- zr0.z = lp[2*68];
- zr0.w = lp[3*68];
- lp += 16;
-
- zr1.x = lp[0*68];
- zr1.y = lp[1*68];
- zr1.z = lp[2*68];
- zr1.w = lp[3*68];
- lp += 16;
-
- zr2.x = lp[0*68];
- zr2.y = lp[1*68];
- zr2.z = lp[2*68];
- zr2.w = lp[3*68];
- lp += 16;
-
- zr3.x = lp[0*68];
- zr3.y = lp[1*68];
- zr3.z = lp[2*68];
- zr3.w = lp[3*68];
-
- lp += 68*4*4 - 3*16;
-
- float4 zi0, zi1, zi2, zi3;
-
- zi0.x = lp[0*68];
- zi0.y = lp[1*68];
- zi0.z = lp[2*68];
- zi0.w = lp[3*68];
- lp += 16;
-
- zi1.x = lp[0*68];
- zi1.y = lp[1*68];
- zi1.z = lp[2*68];
- zi1.w = lp[3*68];
- lp += 16;
-
- zi2.x = lp[0*68];
- zi2.y = lp[1*68];
- zi2.z = lp[2*68];
- zi2.w = lp[3*68];
- lp += 16;
-
- zi3.x = lp[0*68];
- zi3.y = lp[1*68];
- zi3.z = lp[2*68];
- zi3.w = lp[3*68];
-
- // Transform
- FFT4();
-
- // Save result
- __global float4 *gp = (__global float4 *)(gr + (me << 2));
- gp[0*64] = zr0;
- gp[1*64] = zr1;
- gp[2*64] = zr2;
- gp[3*64] = zr3;
-
- gp = (__global float4 *)(gi + (me << 2));
- gp[0*64] = zi0;
- gp[1*64] = zi1;
- gp[2*64] = zi2;
- gp[3*64] = zi3;
-}
-
-// Distance between first real element of successive 1K vectors
-// It must be >= 1024, and a multiple of 4
-#define VSTRIDE (1024+0)
-
-// Performs a 1K complex FFT with every 64 global ids.
-// Each vector is a multiple of VSTRIDE from the first
-// Number of global ids must be a multiple of 64, e.g. 1024*64
-//
-// greal - pointer to input and output real part of data
-// gimag - pointer to input and output imaginary part of data
-__kernel void
-kfft(__global float *greal, __global float *gimag)
-{
- // This is 8704 bytes
- __local float lds[68*4*4*2];
-
- __global float *gr;
- __global float *gi;
- uint gid = get_global_id(0);
- uint me = gid & 0x3fU;
- uint dg = (gid >> 6) * VSTRIDE;
-
- gr = greal + dg;
- gi = gimag + dg;
-
- kfft_pass1(me, gr, gi, lds);
- kfft_pass2(me, lds);
- kfft_pass3(me, lds);
- kfft_pass4(me, lds);
- kfft_pass5(me, lds, gr, gi);
-}
-
+++ /dev/null
-// Used to index into the 1D array, so that we can use
-// it effectively as a 2D array
-int index(int x, int y, int width) {
- return 4*width*y + x*4;
-}
-
-// Turn the raw x coordinates [0, 1] into a scaled x coordinate
-// [0, 1] -> [-2, 1.25]
-float mapX(float x) {
- return x*3.25 - 2;
-}
-
-// Same purpose as mapX
-// [0, 1] -> [-1.25, 1.25]
-float mapY(float y) {
- return y*2.5 - 1.25;
-}
-
-__kernel void render(__global char *out) {
- int x_dim = get_global_id(0);
- int y_dim = get_global_id(1);
- int width = get_global_size(0);
- int height = get_global_size(1);
- int idx = index(x_dim, y_dim, width);
-
- float x_origin = mapX((float) x_dim / width);
- float y_origin = mapY((float) y_dim / height);
-
- // The Escape time algorithm, it follows the pseduocode from Wikipedia
- // _very_ closely
- float x = 0.0;
- float y = 0.0;
-
- int iteration = 0;
-
- // This can be changed, to be more or less precise
- int max_iteration = 256;
- while(x*x + y*y <= 4 && iteration < max_iteration) {
- float xtemp = x*x - y*y + x_origin;
- y = 2*x*y + y_origin;
- x = xtemp;
- iteration++;
- }
-
- if(iteration == max_iteration) {
- // This coordinate did not escape, so it is in the Mandelbrot set
- out[idx] = 0;
- out[idx + 1] = 0;
- out[idx + 2] = 0;
- out[idx + 3] = 255;
- } else {
- // This coordinate did escape, so color based on quickly it escaped
- out[idx] = iteration;
- out[idx + 1] = iteration;
- out[idx + 2] = iteration;
- out[idx + 3] = 255;
- }
-
-}
+++ /dev/null
-#define TILEX 4
-#define TILEX_SHIFT 2
-#define TILEY 4
-#define TILEY_SHIFT 2
-
-/* Output tile size : 4x4 = Each thread computes 16 float values*/
-/* Required global threads = (widthC / 4, heightC / 4) */
-/* This kernel runs on 7xx and CPU as they don't have hardware local memory */
-__kernel void mmmKernel(__global float4 *matrixA,
- __global float4 *matrixB,
- __global float4* matrixC,
- uint widthA, uint widthB)
-{
- int2 pos = (int2)(get_global_id(0), get_global_id(1));
-
-
- float4 sum0 = (float4)(0);
- float4 sum1 = (float4)(0);
- float4 sum2 = (float4)(0);
- float4 sum3 = (float4)(0);
-
- /* Vectorization of input Matrices reduces their width by a factor of 4 */
- widthB /= 4;
-
- for(int i = 0; i < widthA; i=i+4)
- {
- float4 tempA0 = matrixA[i/4 + (pos.y << TILEY_SHIFT) * (widthA / 4)];
- float4 tempA1 = matrixA[i/4 + ((pos.y << TILEY_SHIFT) + 1) * (widthA / 4)];
- float4 tempA2 = matrixA[i/4 + ((pos.y << TILEY_SHIFT) + 2) * (widthA / 4)];
- float4 tempA3 = matrixA[i/4 + ((pos.y << TILEY_SHIFT) + 3) * (widthA / 4)];
-
- //Matrix B is not transposed
- float4 tempB0 = matrixB[pos.x + i * widthB];
- float4 tempB1 = matrixB[pos.x + (i + 1) * widthB];
- float4 tempB2 = matrixB[pos.x + (i + 2) * widthB];
- float4 tempB3 = matrixB[pos.x + (i + 3) * widthB];
-
- sum0.x += tempA0.x * tempB0.x + tempA0.y * tempB1.x + tempA0.z * tempB2.x + tempA0.w * tempB3.x;
- sum0.y += tempA0.x * tempB0.y + tempA0.y * tempB1.y + tempA0.z * tempB2.y + tempA0.w * tempB3.y;
- sum0.z += tempA0.x * tempB0.z + tempA0.y * tempB1.z + tempA0.z * tempB2.z + tempA0.w * tempB3.z;
- sum0.w += tempA0.x * tempB0.w + tempA0.y * tempB1.w + tempA0.z * tempB2.w + tempA0.w * tempB3.w;
-
- sum1.x += tempA1.x * tempB0.x + tempA1.y * tempB1.x + tempA1.z * tempB2.x + tempA1.w * tempB3.x;
- sum1.y += tempA1.x * tempB0.y + tempA1.y * tempB1.y + tempA1.z * tempB2.y + tempA1.w * tempB3.y;
- sum1.z += tempA1.x * tempB0.z + tempA1.y * tempB1.z + tempA1.z * tempB2.z + tempA1.w * tempB3.z;
- sum1.w += tempA1.x * tempB0.w + tempA1.y * tempB1.w + tempA1.z * tempB2.w + tempA1.w * tempB3.w;
-
- sum2.x += tempA2.x * tempB0.x + tempA2.y * tempB1.x + tempA2.z * tempB2.x + tempA2.w * tempB3.x;
- sum2.y += tempA2.x * tempB0.y + tempA2.y * tempB1.y + tempA2.z * tempB2.y + tempA2.w * tempB3.y;
- sum2.z += tempA2.x * tempB0.z + tempA2.y * tempB1.z + tempA2.z * tempB2.z + tempA2.w * tempB3.z;
- sum2.w += tempA2.x * tempB0.w + tempA2.y * tempB1.w + tempA2.z * tempB2.w + tempA2.w * tempB3.w;
-
- sum3.x += tempA3.x * tempB0.x + tempA3.y * tempB1.x + tempA3.z * tempB2.x + tempA3.w * tempB3.x;
- sum3.y += tempA3.x * tempB0.y + tempA3.y * tempB1.y + tempA3.z * tempB2.y + tempA3.w * tempB3.y;
- sum3.z += tempA3.x * tempB0.z + tempA3.y * tempB1.z + tempA3.z * tempB2.z + tempA3.w * tempB3.z;
- sum3.w += tempA3.x * tempB0.w + tempA3.y * tempB1.w + tempA3.z * tempB2.w + tempA3.w * tempB3.w;
- }
- matrixC[pos.x + ((pos.y << TILEY_SHIFT) + 0) * widthB] = sum0;
- matrixC[pos.x + ((pos.y << TILEY_SHIFT) + 1) * widthB] = sum1;
- matrixC[pos.x + ((pos.y << TILEY_SHIFT) + 2) * widthB] = sum2;
- matrixC[pos.x + ((pos.y << TILEY_SHIFT) + 3) * widthB] = sum3;
-}
-
-
-/* Matrix A is cached into local memory block */
-/* Required global threads = (widthC / 4, heightC / 4) */
-__kernel void mmmKernel_local(__global float4 *matrixA,
- __global float4 *matrixB,
- __global float4* matrixC,
- int widthA,
- __local float4 *blockA)
-{
- int blockPos = get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT); //Should be : localId * (TILEX / 4) (float4)
-
- /* Position of thread will be according to the number of values it writes i.e TILE size */
- int globalPos = get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0);
-
- /* Each thread writes 4 float4s */
- float4 sum0 = (float4)(0);
- float4 sum1 = (float4)(0);
- float4 sum2 = (float4)(0);
- float4 sum3 = (float4)(0);
-
- int temp = widthA / 4;
-
- /* This loop runs for number of blocks of A in horizontal direction */
- for(int i = 0; i < (temp / get_local_size(0)); i++)
- {
- /* Calculate global ids of threads from the particular block to load from matrix A depending on i */
- int globalPosA = i * get_local_size(0) + get_local_id(0) + (get_global_id(1) << TILEY_SHIFT) * temp;
-
- /* Load values in blockA from matrixA */
- blockA[blockPos] = matrixA[globalPosA];
- blockA[blockPos + get_local_size(0)] = matrixA[globalPosA + temp];
- blockA[blockPos + 2 * get_local_size(0)] = matrixA[globalPosA + 2 * temp];
- blockA[blockPos + 3 * get_local_size(0)] = matrixA[globalPosA + 3 * temp];
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Calculate global ids of threads from the particular block to load from matrix B depending on i */
- int globalPosB = get_global_id(0) + ((i * get_local_size(0)) << TILEY_SHIFT) * get_global_size(0);
-
- /* This loop runs for number of threads in horizontal direction in the block of A */
- for(int j = 0; j < get_local_size(0) * 4; j=j+4)
- {
- /* Load 4 float4s from blockA : access patters = strided from local memory */
- float4 tempA0 = blockA[(j >> 2) + get_local_id(1) * TILEY * get_local_size(0)];
- float4 tempA1 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 1) * get_local_size(0)];
- float4 tempA2 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 2) * get_local_size(0)];
- float4 tempA3 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 3) * get_local_size(0)];
-
- /* Load corresponding values from matrixB, access pattern = linear from global memory */
- float4 tempB0 = matrixB[globalPosB + j * get_global_size(0)]; //Should be localId.x * (TILEX / 4)
- float4 tempB1 = matrixB[globalPosB + (j + 1) * get_global_size(0)];
- float4 tempB2 = matrixB[globalPosB + (j + 2) * get_global_size(0)];
- float4 tempB3 = matrixB[globalPosB + (j + 3) * get_global_size(0)];
-
- sum0.x += tempA0.x * tempB0.x + tempA0.y * tempB1.x + tempA0.z * tempB2.x + tempA0.w * tempB3.x;
- sum0.y += tempA0.x * tempB0.y + tempA0.y * tempB1.y + tempA0.z * tempB2.y + tempA0.w * tempB3.y;
- sum0.z += tempA0.x * tempB0.z + tempA0.y * tempB1.z + tempA0.z * tempB2.z + tempA0.w * tempB3.z;
- sum0.w += tempA0.x * tempB0.w + tempA0.y * tempB1.w + tempA0.z * tempB2.w + tempA0.w * tempB3.w;
-
- sum1.x += tempA1.x * tempB0.x + tempA1.y * tempB1.x + tempA1.z * tempB2.x + tempA1.w * tempB3.x;
- sum1.y += tempA1.x * tempB0.y + tempA1.y * tempB1.y + tempA1.z * tempB2.y + tempA1.w * tempB3.y;
- sum1.z += tempA1.x * tempB0.z + tempA1.y * tempB1.z + tempA1.z * tempB2.z + tempA1.w * tempB3.z;
- sum1.w += tempA1.x * tempB0.w + tempA1.y * tempB1.w + tempA1.z * tempB2.w + tempA1.w * tempB3.w;
-
- sum2.x += tempA2.x * tempB0.x + tempA2.y * tempB1.x + tempA2.z * tempB2.x + tempA2.w * tempB3.x;
- sum2.y += tempA2.x * tempB0.y + tempA2.y * tempB1.y + tempA2.z * tempB2.y + tempA2.w * tempB3.y;
- sum2.z += tempA2.x * tempB0.z + tempA2.y * tempB1.z + tempA2.z * tempB2.z + tempA2.w * tempB3.z;
- sum2.w += tempA2.x * tempB0.w + tempA2.y * tempB1.w + tempA2.z * tempB2.w + tempA2.w * tempB3.w;
-
- sum3.x += tempA3.x * tempB0.x + tempA3.y * tempB1.x + tempA3.z * tempB2.x + tempA3.w * tempB3.x;
- sum3.y += tempA3.x * tempB0.y + tempA3.y * tempB1.y + tempA3.z * tempB2.y + tempA3.w * tempB3.y;
- sum3.z += tempA3.x * tempB0.z + tempA3.y * tempB1.z + tempA3.z * tempB2.z + tempA3.w * tempB3.z;
- sum3.w += tempA3.x * tempB0.w + tempA3.y * tempB1.w + tempA3.z * tempB2.w + tempA3.w * tempB3.w;
-
- }
-
- }
- /* Write 16 values to matrixC */
- matrixC[globalPos] = sum0;
- matrixC[globalPos + get_global_size(0)] = sum1;
- matrixC[globalPos + 2 * get_global_size(0)] = sum2;
- matrixC[globalPos + 3 * get_global_size(0)] = sum3;
-
-}
-
-#pragma OPENCL EXTENSION cl_amd_printf : enable
-
-/* Optimized version : Tile : 4x4
- Both matrix A and matrix B are cached into local memory blocks */
-__kernel void mmmKernel_local2(__global float4 *matrixA,
- __global float4 *matrixB,
- __global float4* matrixC,
- int widthA,
- __local float4 *blockA,
- __local float4 *blockB)
-
-{
-
- float4 sum0 = (float4)(0);
- float4 sum1 = (float4)(0);
- float4 sum2 = (float4)(0);
- float4 sum3 = (float4)(0);
-
- int temp = widthA / 4;
-
- /* Calculate blockwise-MMM for each pair of blocks along the common width of matrixA and matrixB */
- for(int i = 0; i < (temp / get_local_size(0)); i++)
- {
- /* Local data for blockA from matrixA */
- /* Right now considering only square matrices so width of C = width of A */
- blockA[get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT)] = matrixA[i * get_local_size(0) + get_local_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0)];
- blockA[get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT) + get_local_size(0)] = matrixA[i * get_local_size(0) + get_local_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + get_global_size(0)];
- blockA[get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT) + 2 * get_local_size(0)] = matrixA[i * get_local_size(0) + get_local_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 2 * get_global_size(0)];
- blockA[get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT) + 3 * get_local_size(0)] = matrixA[i * get_local_size(0) + get_local_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 3 * get_global_size(0)];
-
- /* Local data for blockA from matrixB */
- blockB[get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT)] = matrixB[get_global_id(0) + ((i * get_local_size(1) + get_local_id(1)) << TILEY_SHIFT) * get_global_size(0)];
- blockB[get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT) + get_local_size(0)] = matrixB[get_global_id(0) + ((i * get_local_size(1) + get_local_id(1)) << TILEY_SHIFT) * get_global_size(0) + get_global_size(0)];
- blockB[get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT) + 2 * get_local_size(0)] = matrixB[get_global_id(0) + ((i * get_local_size(1) + get_local_id(1)) << TILEY_SHIFT) * get_global_size(0) + 2 * get_global_size(0)];
- blockB[get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT) + 3 * get_local_size(0)] = matrixB[get_global_id(0) + ((i * get_local_size(1) + get_local_id(1)) << TILEY_SHIFT) * get_global_size(0) + 3 * get_global_size(0)];
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Each local thread will read a float4 */
- for(int j = 0; j < get_local_size(0) << 2; j=j+4)
- {
- /* Block dimensions of A = block dimensions of C */
- float4 tempA0 = blockA[(j >> 2) + get_local_id(1) * TILEY * get_local_size(0)];
- float4 tempA1 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 1) * get_local_size(0)];
- float4 tempA2 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 2) * get_local_size(0)];
- float4 tempA3 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 3) * get_local_size(0)];
-
- /* Block dimensions of B = block dimensions of C */
- float4 tempB0 = blockB[get_local_id(0) + j * get_local_size(0)]; //Should be localId.x * (TILEX / 4)
- float4 tempB1 = blockB[get_local_id(0) + (j + 1) * get_local_size(0)];
- float4 tempB2 = blockB[get_local_id(0) + (j + 2) * get_local_size(0)];
- float4 tempB3 = blockB[get_local_id(0) + (j + 3) * get_local_size(0)];
-
- sum0.x += tempA0.x * tempB0.x + tempA0.y * tempB1.x + tempA0.z * tempB2.x + tempA0.w * tempB3.x;
- sum0.y += tempA0.x * tempB0.y + tempA0.y * tempB1.y + tempA0.z * tempB2.y + tempA0.w * tempB3.y;
- sum0.z += tempA0.x * tempB0.z + tempA0.y * tempB1.z + tempA0.z * tempB2.z + tempA0.w * tempB3.z;
- sum0.w += tempA0.x * tempB0.w + tempA0.y * tempB1.w + tempA0.z * tempB2.w + tempA0.w * tempB3.w;
-
- sum1.x += tempA1.x * tempB0.x + tempA1.y * tempB1.x + tempA1.z * tempB2.x + tempA1.w * tempB3.x;
- sum1.y += tempA1.x * tempB0.y + tempA1.y * tempB1.y + tempA1.z * tempB2.y + tempA1.w * tempB3.y;
- sum1.z += tempA1.x * tempB0.z + tempA1.y * tempB1.z + tempA1.z * tempB2.z + tempA1.w * tempB3.z;
- sum1.w += tempA1.x * tempB0.w + tempA1.y * tempB1.w + tempA1.z * tempB2.w + tempA1.w * tempB3.w;
-
- sum2.x += tempA2.x * tempB0.x + tempA2.y * tempB1.x + tempA2.z * tempB2.x + tempA2.w * tempB3.x;
- sum2.y += tempA2.x * tempB0.y + tempA2.y * tempB1.y + tempA2.z * tempB2.y + tempA2.w * tempB3.y;
- sum2.z += tempA2.x * tempB0.z + tempA2.y * tempB1.z + tempA2.z * tempB2.z + tempA2.w * tempB3.z;
- sum2.w += tempA2.x * tempB0.w + tempA2.y * tempB1.w + tempA2.z * tempB2.w + tempA2.w * tempB3.w;
-
- sum3.x += tempA3.x * tempB0.x + tempA3.y * tempB1.x + tempA3.z * tempB2.x + tempA3.w * tempB3.x;
- sum3.y += tempA3.x * tempB0.y + tempA3.y * tempB1.y + tempA3.z * tempB2.y + tempA3.w * tempB3.y;
- sum3.z += tempA3.x * tempB0.z + tempA3.y * tempB1.z + tempA3.z * tempB2.z + tempA3.w * tempB3.z;
- sum3.w += tempA3.x * tempB0.w + tempA3.y * tempB1.w + tempA3.z * tempB2.w + tempA3.w * tempB3.w;
- }
- }
- matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0)] = sum0;
- matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + get_global_size(0)] = sum1;
- matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 2 * get_global_size(0)] = sum2;
- matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 3 * get_global_size(0)] = sum3;
-}
+++ /dev/null
-/*
- * Implemented Gaussian Random Number Generator. SIMD-oriented Fast Mersenne
- * Twister(SFMT) used to generate random numbers and Box mullar transformation used
- * to convert them to Gaussian random numbers.
- *
- * The SFMT algorithm details could be found at
- * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/index.html
- *
- * Box-Muller Transformation
- * http://mathworld.wolfram.com/Box-MullerTransformation.html
- *
- * One invocation of this kernel(gaussianRand), i.e one work thread writes
- * mulFactor output values.
- */
-
-/**
- * @brief Left shift
- * @param input input to be shifted
- * @param shift shifting count
- * @param output result after shifting input
- */
-void
-lshift128(uint4 input, uint shift, uint4* output)
-{
- unsigned int invshift = 32u - shift;
-
- uint4 temp;
- temp.x = input.x << shift;
- temp.y = (input.y << shift) | (input.x >> invshift);
- temp.z = (input.z << shift) | (input.y >> invshift);
- temp.w = (input.w << shift) | (input.z >> invshift);
-
- *output = temp;
-}
-
-/**
- * @brief Right shift
- * @param input input to be shifted
- * @param shift shifting count
- * @param output result after shifting input
- */
-void
-rshift128(uint4 input, uint shift, uint4* output)
-{
- unsigned int invshift = 32u - shift;
-
- uint4 temp;
-
- temp.w = input.w >> shift;
- temp.z = (input.z >> shift) | (input.w << invshift);
- temp.y = (input.y >> shift) | (input.z << invshift);
- temp.x = (input.x >> shift) | (input.y << invshift);
-
- *output = temp;
-}
-
-/**
- * @brief Generates gaussian random numbers by using
- * Mersenenne Twister algo and box muller transformation
- * @param seedArray array of seeds
- * @param width width of 2D seedArray
- * @param mulFactor No of generated random numbers for each seed
- * @param randArray array of random number from given seeds
- */
-__kernel
-void gaussianRand(const __global uint4 *seedArray,
- uint width,
- uint mulFactor,
- __global float4 *gaussianRand)
-{
-
- uint4 temp[8];
-
- size_t xPid = get_global_id(0);
- size_t yPid = get_global_id(1);
-
- uint4 state1 = seedArray[yPid *width + xPid];
- uint4 state2 = (uint4)(0);
- uint4 state3 = (uint4)(0);
- uint4 state4 = (uint4)(0);
- uint4 state5 = (uint4)(0);
-
- uint stateMask = 1812433253u;
- uint thirty = 30u;
- uint4 mask4 = (uint4)(stateMask);
- uint4 thirty4 = (uint4)(thirty);
- uint4 one4 = (uint4)(1u);
- uint4 two4 = (uint4)(2u);
- uint4 three4 = (uint4)(3u);
- uint4 four4 = (uint4)(4u);
-
- uint4 r1 = (uint4)(0);
- uint4 r2 = (uint4)(0);
-
- uint4 a = (uint4)(0);
- uint4 b = (uint4)(0);
-
- uint4 e = (uint4)(0);
- uint4 f = (uint4)(0);
-
- unsigned int thirteen = 13u;
- unsigned int fifteen = 15u;
- unsigned int shift = 8u * 3u;
-
- unsigned int mask11 = 0xfdff37ffu;
- unsigned int mask12 = 0xef7f3f7du;
- unsigned int mask13 = 0xff777b7du;
- unsigned int mask14 = 0x7ff7fb2fu;
-
- size_t actualPos = (size_t)0;
-
-
- const float one = 1.0f;
- const float intMax = 4294967296.0f;
- const float PI = 3.14159265358979f;
- const float two = 2.0f;
-
- float4 r;
- float4 phi;
-
- float4 temp1;
- float4 temp2;
-
- //Initializing states.
- state2 = mask4 * (state1 ^ (state1 >> thirty4)) + one4;
- state3 = mask4 * (state2 ^ (state2 >> thirty4)) + two4;
- state4 = mask4 * (state3 ^ (state3 >> thirty4)) + three4;
- state5 = mask4 * (state4 ^ (state4 >> thirty4)) + four4;
-
- uint i = 0;
- for(i = 0; i < mulFactor; ++i)
- {
- switch(i)
- {
- case 0:
- r1 = state4;
- r2 = state5;
- a = state1;
- b = state3;
- break;
- case 1:
- r1 = r2;
- r2 = temp[0];
- a = state2;
- b = state4;
- break;
- case 2:
- r1 = r2;
- r2 = temp[1];
- a = state3;
- b = state5;
- break;
- case 3:
- r1 = r2;
- r2 = temp[2];
- a = state4;
- b = state1;
- break;
- case 4:
- r1 = r2;
- r2 = temp[3];
- a = state5;
- b = state2;
- break;
- case 5:
- r1 = r2;
- r2 = temp[4];
- a = temp[0];
- b = temp[2];
- break;
- case 6:
- r1 = r2;
- r2 = temp[5];
- a = temp[1];
- b = temp[3];
- break;
- case 7:
- r1 = r2;
- r2 = temp[6];
- a = temp[2];
- b = temp[4];
- break;
- default:
- break;
-
- }
-
- lshift128(a, shift, &e);
- rshift128(r1, shift, &f);
-
- temp[i].x = a.x ^ e.x ^ ((b.x >> thirteen) & mask11) ^ f.x ^ (r2.x << fifteen);
- temp[i].y = a.y ^ e.y ^ ((b.y >> thirteen) & mask12) ^ f.y ^ (r2.y << fifteen);
- temp[i].z = a.z ^ e.z ^ ((b.z >> thirteen) & mask13) ^ f.z ^ (r2.z << fifteen);
- temp[i].w = a.w ^ e.w ^ ((b.w >> thirteen) & mask14) ^ f.w ^ (r2.w << fifteen);
- }
-
- actualPos = (yPid * width + xPid) * mulFactor;
-
- for(i = 0; i < mulFactor / 2; ++i)
- {
- temp1 = convert_float4(temp[i]) * one / intMax;
- temp2 = convert_float4(temp[i + 1]) * one / intMax;
-
- // Asinf Box Mullar Transformations.
- r = sqrt((-two) * log(temp1));
- phi = two * PI * temp2;
- gaussianRand[actualPos + i * 2 + 0] = r * cos(phi);
- gaussianRand[actualPos + i * 2 + 1] = r * sin(phi);
- }
-}
+++ /dev/null
-/*
- * For a description of the algorithm and the terms used, please see the
- * documentation for this sample.
- *
- * One invocation of calPriceVega kernel, i.e one work thread caluculates the
- * price value and path derivative from given initial price, strike price,
- * interest rate and maturity
- */
-
- typedef struct _MonteCalroAttrib
- {
- float4 strikePrice;
- float4 c1;
- float4 c2;
- float4 c3;
- float4 initPrice;
- float4 sigma;
- float4 timeStep;
- }MonteCarloAttrib;
-
-
-
-/**
- * @brief Left shift
- * @param input input to be shifted
- * @param shift shifting count
- * @param output result after shifting input
- */
-void
-lshift128(uint4 input, uint shift, uint4* output)
-{
- unsigned int invshift = 32u - shift;
-
- uint4 temp;
- temp.x = input.x << shift;
- temp.y = (input.y << shift) | (input.x >> invshift);
- temp.z = (input.z << shift) | (input.y >> invshift);
- temp.w = (input.w << shift) | (input.z >> invshift);
-
- *output = temp;
-}
-
-/**
- * @brief Right shift
- * @param input input to be shifted
- * @param shift shifting count
- * @param output result after shifting input
- */
-void
-rshift128(uint4 input, uint shift, uint4* output)
-{
- unsigned int invshift = 32u - shift;
-
- uint4 temp;
-
- temp.w = input.w >> shift;
- temp.z = (input.z >> shift) | (input.w << invshift);
- temp.y = (input.y >> shift) | (input.z << invshift);
- temp.x = (input.x >> shift) | (input.y << invshift);
-
- *output = temp;
-}
-
-/**
- * @brief Generates gaussian random numbers by using
- * Mersenenne Twister algo and box muller transformation
- * @param seedArray seed
- * @param gaussianRand1 gaussian random number generatred
- * @param gaussianRand2 gaussian random number generarted
- * @param nextRand generated seed for next usage
- */
-void generateRand(uint4 seed,
- float4 *gaussianRand1,
- float4 *gaussianRand2,
- uint4 *nextRand)
-{
-
- uint mulFactor = 4;
- uint4 temp[8];
-
- uint4 state1 = seed;
- uint4 state2 = (uint4)(0);
- uint4 state3 = (uint4)(0);
- uint4 state4 = (uint4)(0);
- uint4 state5 = (uint4)(0);
-
- uint stateMask = 1812433253u;
- uint thirty = 30u;
- uint4 mask4 = (uint4)(stateMask);
- uint4 thirty4 = (uint4)(thirty);
- uint4 one4 = (uint4)(1u);
- uint4 two4 = (uint4)(2u);
- uint4 three4 = (uint4)(3u);
- uint4 four4 = (uint4)(4u);
-
- uint4 r1 = (uint4)(0);
- uint4 r2 = (uint4)(0);
-
- uint4 a = (uint4)(0);
- uint4 b = (uint4)(0);
-
- uint4 e = (uint4)(0);
- uint4 f = (uint4)(0);
-
- unsigned int thirteen = 13u;
- unsigned int fifteen = 15u;
- unsigned int shift = 8u * 3u;
-
- unsigned int mask11 = 0xfdff37ffu;
- unsigned int mask12 = 0xef7f3f7du;
- unsigned int mask13 = 0xff777b7du;
- unsigned int mask14 = 0x7ff7fb2fu;
-
-
- const float one = 1.0f;
- const float intMax = 4294967296.0f;
- const float PI = 3.14159265358979f;
- const float two = 2.0f;
-
- float4 r;
- float4 phi;
-
- float4 temp1;
- float4 temp2;
-
- //Initializing states.
- state2 = mask4 * (state1 ^ (state1 >> thirty4)) + one4;
- state3 = mask4 * (state2 ^ (state2 >> thirty4)) + two4;
- state4 = mask4 * (state3 ^ (state3 >> thirty4)) + three4;
- state5 = mask4 * (state4 ^ (state4 >> thirty4)) + four4;
-
- uint i = 0;
- for(i = 0; i < mulFactor; ++i)
- {
- switch(i)
- {
- case 0:
- r1 = state4;
- r2 = state5;
- a = state1;
- b = state3;
- break;
- case 1:
- r1 = r2;
- r2 = temp[0];
- a = state2;
- b = state4;
- break;
- case 2:
- r1 = r2;
- r2 = temp[1];
- a = state3;
- b = state5;
- break;
- case 3:
- r1 = r2;
- r2 = temp[2];
- a = state4;
- b = state1;
- break;
- default:
- break;
-
- }
-
- lshift128(a, shift, &e);
- rshift128(r1, shift, &f);
-
- temp[i].x = a.x ^ e.x ^ ((b.x >> thirteen) & mask11) ^ f.x ^ (r2.x << fifteen);
- temp[i].y = a.y ^ e.y ^ ((b.y >> thirteen) & mask12) ^ f.y ^ (r2.y << fifteen);
- temp[i].z = a.z ^ e.z ^ ((b.z >> thirteen) & mask13) ^ f.z ^ (r2.z << fifteen);
- temp[i].w = a.w ^ e.w ^ ((b.w >> thirteen) & mask14) ^ f.w ^ (r2.w << fifteen);
- }
-
- temp1 = convert_float4(temp[0]) * one / intMax;
- temp2 = convert_float4(temp[1]) * one / intMax;
-
- // Applying Box Mullar Transformations.
- r = sqrt((-two) * log(temp1));
- phi = two * PI * temp2;
- *gaussianRand1 = r * cos(phi);
- *gaussianRand2 = r * sin(phi);
- *nextRand = temp[2];
-
-}
-
-/**
- * @brief calculates the price and vega for all trajectories
- */
-void
-calOutputs(float4 strikePrice,
- float4 meanDeriv1,
- float4 meanDeriv2,
- float4 meanPrice1,
- float4 meanPrice2,
- float4 *pathDeriv1,
- float4 *pathDeriv2,
- float4 *priceVec1,
- float4 *priceVec2)
-{
- float4 temp1 = (float4)0.0f;
- float4 temp2 = (float4)0.0f;
- float4 temp3 = (float4)0.0f;
- float4 temp4 = (float4)0.0f;
-
- float4 tempDiff1 = meanPrice1 - strikePrice;
- float4 tempDiff2 = meanPrice2 - strikePrice;
- if(tempDiff1.x > 0.0f)
- {
- temp1.x = 1.0f;
- temp3.x = tempDiff1.x;
- }
- if(tempDiff1.y > 0.0f)
- {
- temp1.y = 1.0f;
- temp3.y = tempDiff1.y ;
- }
- if(tempDiff1.z > 0.0f)
- {
- temp1.z = 1.0f;
- temp3.z = tempDiff1.z;
- }
- if(tempDiff1.w > 0.0f)
- {
- temp1.w = 1.0f;
- temp3.w = tempDiff1.w;
- }
-
- if(tempDiff2.x > 0.0f)
- {
- temp2.x = 1.0f;
- temp4.x = tempDiff2.x;
- }
- if(tempDiff2.y > 0.0f)
- {
- temp2.y = 1.0f;
- temp4.y = tempDiff2.y;
- }
- if(tempDiff2.z > 0.0f)
- {
- temp2.z = 1.0f;
- temp4.z = tempDiff2.z;
- }
- if(tempDiff2.w > 0.0f)
- {
- temp2.w = 1.0f;
- temp4.w = tempDiff2.w;
- }
-
- *pathDeriv1 = meanDeriv1 * temp1;
- *pathDeriv2 = meanDeriv2 * temp2;
- *priceVec1 = temp3;
- *priceVec2 = temp4;
-}
-
-/**
- * @brief Calculates the price and vega for all trajectories for given random numbers
- * @param attrib structure of inputs for simulation
- * @param width width of random array
- * @param priceSamples array of calculated price samples
- * @param pathDeriv array calculated path derivatives
- */
-__kernel
-void
-calPriceVega(MonteCarloAttrib attrib,
- int noOfSum,
- int width,
- __global uint4 *randArray,
- __global float4 *priceSamples,
- __global float4 *pathDeriv)
-{
-
-
- float4 strikePrice = attrib.strikePrice;
- float4 c1 = attrib.c1;
- float4 c2 = attrib.c2;
- float4 c3 = attrib.c3;
- float4 initPrice = attrib.initPrice;
- float4 sigma = attrib.sigma;
- float4 timeStep = attrib.timeStep;
-
- size_t xPos = get_global_id(0);
- size_t yPos = get_global_id(1);
-
- float4 temp = (float4)0.0f;
-
- float4 price1 = (float4)0.0f;
- float4 price2 = (float4)0.0f;
- float4 pathDeriv1 = (float4)0.0f;
- float4 pathDeriv2 = (float4)0.0f;
-
- float4 trajPrice1 = initPrice;
- float4 trajPrice2 = initPrice;
-
- float4 sumPrice1 = initPrice;
- float4 sumPrice2 = initPrice;
-
- float4 meanPrice1 = temp;
- float4 meanPrice2 = temp;
-
- float4 sumDeriv1 = temp;
- float4 sumDeriv2 = temp;
-
- float4 meanDeriv1 = temp;
- float4 meanDeriv2 = temp;
-
- float4 finalRandf1 = temp;
- float4 finalRandf2 = temp;
-
- uint4 nextRand = randArray[yPos * width + xPos];
-
- //Run the Monte Carlo simulation a total of Num_Sum - 1 times
- for(int i = 1; i < noOfSum; i++)
- {
- uint4 tempRand = nextRand;
- generateRand(tempRand, &finalRandf1, &finalRandf2, &nextRand);
-
- //Calculate the trajectory price and sum price for all trajectories
- trajPrice1 = trajPrice1 * exp(c1 + c2 * finalRandf1);
- trajPrice2 = trajPrice2 * exp(c1 + c2 * finalRandf2);
-
- sumPrice1 = sumPrice1 + trajPrice1;
- sumPrice2 = sumPrice2 + trajPrice2;
-
- temp = c3 * timeStep * i;
-
- // Calculate the derivative price for all trajectories
- sumDeriv1 = sumDeriv1 + trajPrice1
- * ((log(trajPrice1 / initPrice) - temp) / sigma);
-
- sumDeriv2 = sumDeriv2 + trajPrice2
- * ((log(trajPrice2 / initPrice) - temp) / sigma);
-
- }
-
- //Calculate the average price and \93average derivative\94 of each simulated path
- meanPrice1 = sumPrice1 / noOfSum;
- meanPrice2 = sumPrice2 / noOfSum;
- meanDeriv1 = sumDeriv1 / noOfSum;
- meanDeriv2 = sumDeriv2 / noOfSum;
-
- calOutputs(strikePrice, meanDeriv1, meanDeriv2, meanPrice1,
- meanPrice2, &pathDeriv1, &pathDeriv2, &price1, &price2);
-
- priceSamples[(yPos * width + xPos) * 2] = price1;
- priceSamples[(yPos * width + xPos) * 2 + 1] = price2;
- pathDeriv[(yPos * width + xPos) * 2] = pathDeriv1;
- pathDeriv[(yPos * width + xPos) * 2 + 1] = pathDeriv2;
-
-}
-
+++ /dev/null
-/*
- * For a description of the algorithm and the terms used, please see the
- * documentation for this sample.
- *
- * Each work-item invocation of this kernel, calculates the position for
- * one particle
- *
- * Work-items use local memory to reduce memory bandwidth and reuse of data
- */
-
-
-
-__kernel
-
-void
-
-nbody_sim(
-
- __global float4* pos ,
-
- __global float4* vel,
-
- int numBodies,
-
- float deltaTime,
-
- float epsSqr,
-
- __local float4* localPos,
- __global float4* newPosition,
- __global float4* newVelocity)
-
-{
- unsigned int tid = get_local_id(0);
-
- unsigned int gid = get_global_id(0);
-
- unsigned int localSize = get_local_size(0);
-
-
-
- // Number of tiles we need to iterate
-
- unsigned int numTiles = numBodies / localSize;
-
-
-
- // position of this work-item
-
- float4 myPos = pos[gid];
-
- float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
-
-
-
- for(int i = 0; i < numTiles; ++i)
-
- {
-
- // load one tile into local memory
-
- int idx = i * localSize + tid;
-
- localPos[tid] = pos[idx];
-
-
-
- // Synchronize to make sure data is available for processing
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
-
-
- // calculate acceleration effect due to each body
-
- // a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
-
- //for(int j = 0; j < localSize; ++j)
- int j = 0;
- {
-
- // Calculate acceleartion caused by particle j on particle i
-
- float4 r = localPos[j] - myPos;
-
- float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
-
- float invDist = 1.0f / sqrt(distSqr + epsSqr);
-
- float invDistCube = invDist * invDist * invDist;
-
- float s = localPos[j].w * invDistCube;
-
-
-
- // accumulate effect of all particles
-
- acc += s * r;
-
- }
-
-
-
- // Synchronize so that next tile can be loaded
-
- //barrier(CLK_LOCAL_MEM_FENCE);
-
- }
-
-
-
- float4 oldVel = vel[gid];
-
-
-
- // updated position and velocity
-
- float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;
-
- newPos.w = myPos.w;
-
-
-
- float4 newVel = oldVel + acc * deltaTime;
-
-
-
- // write to global memory
-
- newPosition[gid] = newPos;
-
- newVelocity[gid] = newVel;
-}
-
+++ /dev/null
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia@intel.com>
- */
-
-#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
-__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##0(void); \
-__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##1(void); \
-__attribute__((pure,const)) unsigned int __gen_ocl_##NAME##2(void);
-DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
-DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
-DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
-DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
-DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
-#undef DECL_INTERNAL_WORK_ITEM_FN
-
-#define DECL_PUBLIC_WORK_ITEM_FN(NAME) \
-inline unsigned NAME(unsigned int dim) { \
- if (dim == 0) return __gen_ocl_##NAME##0(); \
- else if (dim == 1) return __gen_ocl_##NAME##1(); \
- else if (dim == 2) return __gen_ocl_##NAME##2(); \
- else return 0; \
-}
-DECL_PUBLIC_WORK_ITEM_FN(get_group_id)
-DECL_PUBLIC_WORK_ITEM_FN(get_local_id)
-DECL_PUBLIC_WORK_ITEM_FN(get_local_size)
-DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
-DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
-#undef DECL_PUBLIC_WORK_ITEM_FN
-
-inline unsigned int get_global_id(unsigned int dim) {
- return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
-}
-
-__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c);
-__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) {
- return cond ? src0 : src1;
-}
-__attribute__((overloadable)) inline int select(int src0, int src1, int cond) {
- return cond ? src0 : src1;
-}
-
-typedef float float2 __attribute__((ext_vector_type(2)));
-typedef float float3 __attribute__((ext_vector_type(3)));
-typedef float float4 __attribute__((ext_vector_type(4)));
-typedef int int2 __attribute__((ext_vector_type(2)));
-typedef int int3 __attribute__((ext_vector_type(3)));
-typedef int int4 __attribute__((ext_vector_type(4)));
-typedef int uint2 __attribute__((ext_vector_type(2)));
-typedef unsigned uint3 __attribute__((ext_vector_type(3)));
-typedef unsigned uint4 __attribute__((ext_vector_type(4)));
-typedef bool bool2 __attribute__((ext_vector_type(2)));
-typedef bool bool3 __attribute__((ext_vector_type(3)));
-typedef bool bool4 __attribute__((ext_vector_type(4)));
-
-// This will be optimized out by LLVM and will output LLVM select instructions
-#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
-__attribute__((overloadable)) \
-inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
- TYPE4 dst; \
- const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
- const TYPE x1 = src1.x; \
- const TYPE y0 = src0.y; \
- const TYPE y1 = src1.y; \
- const TYPE z0 = src0.z; \
- const TYPE z1 = src1.z; \
- const TYPE w0 = src0.w; \
- const TYPE w1 = src1.w; \
- \
- dst.x = (cond.x & MASK) ? x1 : x0; \
- dst.y = (cond.y & MASK) ? y1 : y0; \
- dst.z = (cond.z & MASK) ? z1 : z0; \
- dst.w = (cond.w & MASK) ? w1 : w0; \
- return dst; \
-}
-DECL_SELECT4(int4, int, int4, 0x80000000)
-DECL_SELECT4(float4, float, int4, 0x80000000)
-#undef DECL_SELECT4
-
-__attribute__((overloadable,always_inline)) inline float2 mad(float2 a, float2 b, float2 c) {
- return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));
-}
-__attribute__((overloadable,always_inline)) inline float3 mad(float3 a, float3 b, float3 c) {
- return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));
-}
-__attribute__((overloadable,always_inline)) inline float4 mad(float4 a, float4 b, float4 c) {
- return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),
- mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));
-}
-
-#define __private __attribute__((address_space(0)))
-#define __global __attribute__((address_space(1)))
-#define __constant __attribute__((address_space(2)))
-//#define __local __attribute__((address_space(3)))
-#define global __global
-//#define local __local
-#define constant __constant
-#define private __private
-
-#define NULL ((void*)0)
+++ /dev/null
-__kernel
-void test1(__global char* svm,
- uint svmBase,
- uint context)
-{
- int i = get_global_id(0);
- svm -= svmBase;
- __global int *ptr = (__global int *)&svm[context];
- ptr[i]=i;
-}
-
+++ /dev/null
-__kernel void
-test_2d_copy(__global int* dst0,
- __global int* dst1,
- __global int* src,
- int w)
-{
- const int x = (int)get_global_id(0);
- const int y = (int)get_global_id(1);
- const int index = x + y * w;
- dst0[index] = src[index];
- dst1[index] = x + y;
-}
-
+++ /dev/null
-__kernel void test_barrier ( __global int *dst,
- const float a0,
- const char a1,
- const int a2,
- __global int *src,
- const short a3,
- const uint a4,
- const int a5)
-{
- int tid = get_global_id(0);
- dst[tid] = (int)a0 + (int)a1 + (int)a2 + (int)a3 + (int)a4 + (int)a5 + src[tid];
- barrier(CLK_LOCAL_MEM_FENCE);
-}
-
+++ /dev/null
-__constant int hop[] = {8};
-
-__kernel void test_constant_memory(__global int *dst,
- __local int *local0,
- __constant int *constants)
-{
- __local int local1[32];
- int id = get_local_id(0);
- int tid = get_global_id(0);
- local0[id] = local1[id] = id;
- barrier(CLK_LOCAL_MEM_FENCE);
- dst[tid] = local0[32-id-1] + local1[32-id-1] + constants[id] + hop[0];
-}
-
+++ /dev/null
-__constant sampler_t s = CLK_NORMALIZED_COORDS_FALSE |
- CLK_ADDRESS_CLAMP |
- CLK_FILTER_NEAREST;
-__kernel void
-test_copy_image(__read_only image2d_t src, __global uchar4 *dst)
-{
- const int x = (int) get_global_id(0);
- const int y = (int) get_global_id(1);
- const int id = x + y * get_image_width(src);
- const uchar4 from = convert_uchar4(read_imageui(src, s, (int2)(x,y)));
- dst[id] = from;
-}
-
+++ /dev/null
-__kernel void test_imm_parameters (__global int *dst,
- const float a0,
- const char a1,
- const int a2,
- __global int *src,
- const short a3,
- const uint a4,
- const int a5)
-{
- int tid = get_global_id(0);
- const int from = src[tid];
- dst[6*tid+0] = (int) a0 + from;
- dst[6*tid+1] = (int) a1 + from;
- dst[6*tid+2] = (int) a2 + from;
- dst[6*tid+3] = (int) a3 + from;
- dst[6*tid+4] = (int) a4 + from;
- dst[6*tid+5] = (int) a5 + from;
-}
-
+++ /dev/null
-__kernel void test_local_memory(__global int *dst, __local int *local0, __local int *local1)
-{
- int id = get_local_id(0);
- int tid = get_global_id(0);
- local0[id] = local1[id] = id;
- barrier(CLK_LOCAL_MEM_FENCE);
- dst[tid] = local0[32-id-1] + local1[32-id-1];
-}
-
+++ /dev/null
-__kernel void test_private_memory(__global int *dst)
-{
- int local0[32];
- int tid = get_global_id(0);
- int i, res = 0;
-
- for (i = 0; i < 32; ++i) {
- local0[i] = i + tid;
- }
- for (i = 0; i < 32; ++i) {
- res += local0[i];
- }
-
- dst[tid] = res;
-}
-
+++ /dev/null
-__kernel void test_static_local_memory(__global int *dst)
-{
- __local int local0[32], local1[32];
- int id = get_local_id(0);
- int tid = get_global_id(0);
- local0[id] = local1[id] = id;
- barrier(CLK_LOCAL_MEM_FENCE);
- dst[tid] = local0[32-id-1] + local1[32-id-1];
-}
-
+++ /dev/null
-__kernel void
-test_trigo(__global float *dst, __global const float *src)
-{
- const int offset = get_global_id(0);
- dst[offset] = sin(src[offset]);
-}
-
+++ /dev/null
-/*
- * Copies a block to the local memory
- * and copies back the transpose from local memory to output
- * @param output output matrix
- * @param input input matrix
- * @param block local memory of size blockSize x blockSize
- * @param width width of the input matrix
- * @param height height of the input matrix
- * @param blockSize size of the block
- */
-
-__kernel
-void matrixTranspose(__global float * output,
- __global float * input,
- __local float * block,
- const uint width,
- const uint height,
- const uint blockSize
- )
-{
- uint globalIdx = get_global_id(0);
- uint globalIdy = get_global_id(1);
-
- uint localIdx = get_local_id(0);
- uint localIdy = get_local_id(1);
-
- /* copy from input to local memory */
- block[localIdy*blockSize + localIdx] = input[globalIdy*width + globalIdx];
-
- /* wait until the whole block is filled */
- barrier(CLK_LOCAL_MEM_FENCE);
-
- uint groupIdx = get_group_id(0);
- uint groupIdy = get_group_id(1);
-
- /* calculate the corresponding target location for transpose by inverting x and y values*/
- uint targetGlobalIdx = groupIdy*blockSize + localIdy;
- uint targetGlobalIdy = groupIdx*blockSize + localIdx;
-
- /* calculate the corresponding raster indices of source and target */
- uint targetIndex = targetGlobalIdy*height + targetGlobalIdx;
- uint sourceIndex = localIdy * blockSize + localIdx;
-
- output[targetIndex] = block[sourceIndex];
-}
+++ /dev/null
-
-#define IA 16807 // a
-#define IM 2147483647 // m
-#define AM (1.0/IM) // 1/m - To calculate floating point result
-#define IQ 127773
-#define IR 2836
-#define NTAB 16
-#define NDIV (1 + (IM - 1)/ NTAB)
-#define EPS 1.2e-7
-#define RMAX (1.0 - EPS)
-#define GROUP_SIZE 64
-
-
-
-/* Generate uniform random deviation */
-/* Park-Miller with Bays-Durham shuffle and added safeguards
- Returns a uniform random deviate between (-FACTOR/2, FACTOR/2)
- input seed should be negative */
-float ran1(int idum, __local int *iv)
-{
- int j;
- int k;
- int iy = 0;
- int tid = get_local_id(0);
-
- for(j = NTAB; j >=0; j--) //Load the shuffle
- {
- k = idum / IQ;
- idum = IA * (idum - k * IQ) - IR * k;
-
- if(idum < 0)
- idum += IM;
-
- if(j < NTAB)
- iv[NTAB* tid + j] = idum;
- }
- iy = iv[NTAB* tid];
-
- k = idum / IQ;
- idum = IA * (idum - k * IQ) - IR * k;
-
- if(idum < 0)
- idum += IM;
-
- j = iy / NDIV;
- iy = iv[NTAB * tid + j];
- return (AM * iy); //AM *iy will be between 0.0 and 1.0
-}
-
-
-
-__kernel void noise_uniform(__global uchar4* inputImage, __global uchar4* outputImage, int factor)
-{
- int pos = get_global_id(0) + get_global_id(1) * get_global_size(0);
-
- float4 temp = convert_float4(inputImage[pos]);
-
- /* compute average value of a pixel from its compoments */
- float avg = (temp.x + temp.y + temp.z + temp.y) / 4;
-
- /* Each thread has NTAB private values */
- /* Local memory is used as indexed arrays use global memory instead of registers */
- __local int iv[NTAB * GROUP_SIZE];
-
- /* Calculate deviation from the avg value of a pixel */
- float dev = ran1(-avg, iv);
- dev = (dev - 0.55) * factor;
-
- /* Saturate(clamp) the values */
- outputImage[pos] = convert_uchar4_sat(temp + (float4)(dev));
-}
+++ /dev/null
-__kernel void vadd_gpu(
- __global const float* a,
- __global const float* b,
- __global float* c,
- int n)
-{
- const int i = get_global_id(0);
- c[i] = a[i] + b[i];
-}
-