MobinMQL/Scripts/Examples/OpenCL/Double/Kernels/matrixmult.cl
2025-07-22 14:48:34 +03:00

69 lines
2.7 KiB
Common Lisp

//--- by default, some GPUs do not support doubles
//--- the cl_khr_fp64 directive is used to enable work with doubles
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
//+-----------------------------------------------------------+
//| OpenCL kernel for matrix multiplication                   |
//| using global work groups                                  |
//+-----------------------------------------------------------+
//| http://gpgpu-computing4.blogspot.ru/2009/09/              |
//| /matrix-multiplication-2-opencl.html                      |
//+-----------------------------------------------------------+
//| Each work-item computes one element of matrix_c:          |
//|   get_global_id(0) -> row index, get_global_id(1) ->      |
//|   column index. All matrices are indexed row-major.       |
//| matrix_a : rows_a x cols_a (read)                         |
//| matrix_b : cols_a x cols_b (read)                         |
//| matrix_c : rows_a x cols_b (written)                      |
//+-----------------------------------------------------------+
__kernel void MatrixMult_GPU1(__global double *matrix_a,
                              __global double *matrix_b,
                              __global double *matrix_c,
                              int rows_a,int cols_a,int cols_b)
{
    int i=get_global_id(0);
    int j=get_global_id(1);
//--- the global work size may be rounded up past the matrix
//--- dimensions; such work-items must not touch the buffers
    if(i>=rows_a || j>=cols_b)
        return;
//--- dot product of row i of matrix_a and column j of matrix_b
    int row_a=cols_a*i;
    double sum=0.0;
    for(int k=0; k<cols_a; k++)
    {
        sum+=matrix_a[row_a+k]*matrix_b[cols_b*k+j];
    }
    matrix_c[cols_b*i+j]=sum;
}
//--- edge length of the square tiles used by MatrixMult_GPU2, which
//--- indexes its BLOCK_SIZE x BLOCK_SIZE local arrays by local id;
//--- defaults to 10 unless already defined (e.g. -D BLOCK_SIZE=16)
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 10
#endif
//+-----------------------------------------------------------+
//| OpenCL kernel for matrix multiplication                   |
//| using local groups with common local memory               |
//+-----------------------------------------------------------+
//| http://gpgpu-computing4.blogspot.ru/2009/10/              |
//| /matrix-multiplication-3-opencl.html                      |
//+-----------------------------------------------------------+
//| Each work-group computes one BLOCK_SIZE x BLOCK_SIZE tile |
//| of matrix_c, walking matching tiles of matrix_a and       |
//| matrix_b through __local memory.                          |
//|   get_group_id(0) -> column block, get_group_id(1) -> row |
//|   block; get_local_id(0) -> row inside the tile,          |
//|   get_local_id(1) -> column inside the tile.              |
//| The local arrays are indexed by local id, so local ids    |
//| must stay below BLOCK_SIZE in both dimensions.            |
//| matrix_a : rows_a x cols_a (read)                         |
//| matrix_b : cols_a x cols_b (read)                         |
//| matrix_c : rows_a x cols_b (written)                      |
//| Elements outside the matrices are loaded as zero and are  |
//| not stored, so the dimensions need not be multiples of    |
//| BLOCK_SIZE.                                               |
//| NOTE(review): MatrixMult_GPU1 maps dimension 0 to rows,   |
//| here the dimension-0 group selects the column block -     |
//| confirm the host's global work size order for non-square  |
//| matrices.                                                 |
//+-----------------------------------------------------------+
__kernel void MatrixMult_GPU2(__global double *matrix_a,
                              __global double *matrix_b,
                              __global double *matrix_c,
                              int rows_a,int cols_a,int cols_b)
{
//--- OpenCL C requires __local variables to be declared at
//--- kernel function scope, not inside a nested block
    __local double submatrix_a[BLOCK_SIZE][BLOCK_SIZE];
    __local double submatrix_b[BLOCK_SIZE][BLOCK_SIZE];

    int group_i=get_group_id(0);
    int group_j=get_group_id(1);
    int i=get_local_id(0);
    int j=get_local_id(1);
//--- element of matrix_c owned by this work-item
    int row=BLOCK_SIZE*group_j+i;
    int col=BLOCK_SIZE*group_i+j;
    double sum=0.0;
//--- the trip count depends only on cols_a, so every work-item
//--- of the group reaches both barriers the same number of times
    for(int tile=0; tile<cols_a; tile+=BLOCK_SIZE)
    {
        int a_col=tile+j;
        int b_row=tile+i;
        //--- zero padding keeps partial tiles from reading past
        //--- the matrices and adds nothing to the sum
        submatrix_a[i][j]=(row<rows_a && a_col<cols_a) ? matrix_a[cols_a*row+a_col] : 0.0;
        submatrix_b[i][j]=(b_row<cols_a && col<cols_b) ? matrix_b[cols_b*b_row+col] : 0.0;
        //--- wait until the whole tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);
        for(int k=0; k<BLOCK_SIZE; k++)
            sum+=submatrix_a[i][k]*submatrix_b[k][j];
        //--- wait until everyone is done before the tile is overwritten
        barrier(CLK_LOCAL_MEM_FENCE);
    }
//--- no early return above: all work-items must take part in the barriers
    if(row<rows_a && col<cols_b)
        matrix_c[cols_b*row+col]=sum;
}
//+------------------------------------------------------------------+