//--- by default some GPU doesn't support doubles //--- cl_khr_fp64 directive is used to enable work with doubles #pragma OPENCL EXTENSION cl_khr_fp64 : enable //+------------------------------------------------------------------+ //| fft_init OpenCL kernel for Fast Fourier Transfrom | //+------------------------------------------------------------------+ //| Matthew Scarpino, "OpenCL in Action: How to accelerate graphics | //| and computations", Manning, 2012, Chapter 14. | //+------------------------------------------------------------------+ __kernel void fft_init(__global double2 *in_data, __global double2 *out_data, __local double2 *l_data, uint points_per_group,uint size,int dir) { uint4 br,index; uint points_per_item,g_addr,l_addr,i,fft_index,stage,N2; double2 x1,x2,x3,x4,sum12,diff12,sum34,diff34; points_per_item=points_per_group/get_local_size(0); l_addr = get_local_id(0)*points_per_item; g_addr = get_group_id(0)*points_per_group + l_addr; //--- load data from bit-reversed addresses and perform 4-point FFTs for(i=0; i> N2) & stage; //--- bit-reverse addresses while(N2>1) { N2-=2; fft_index>>=1; stage<<=1; br |= (index << N2) & fft_index; br |= (index >> N2) & stage; } //--- load global data x1 = in_data[br.s0]; x2 = in_data[br.s1]; x3 = in_data[br.s2]; x4 = in_data[br.s3]; sum12=x1+x2; diff12= x1-x2; sum34 = x3+x4; diff34=(double2)(x3.s1-x4.s1,x4.s0-x3.s0)*dir; l_data[l_addr]=sum12+sum34; l_data[l_addr+1] = diff12 + diff34; l_data[l_addr+2] = sum12 - sum34; l_data[l_addr+3] = diff12 - diff34; l_addr += 4; g_addr += 4; } //--- perform initial stages of the FFT - each of length N2*2 for(N2=4; N2