// kdevelop/plugins/clang/tests/files/cuda.cu
#include <stdlib.h>

// Thread block size
#define BLOCK_SIZE 16

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;

// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col,
                           float value)
{
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row
                                + BLOCK_SIZE * col];
    return Asub;
}

// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;

    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;

    // Loop over all the sub-matrices of A and B that are
    // required to compute Csub
    // Multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {

        // Get sub-matrix Asub of A
        Matrix Asub = GetSubMatrix(A, blockRow, m);

        // Get sub-matrix Bsub of B
        Matrix Bsub = GetSubMatrix(B, m, blockCol);

        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load Asub and Bsub from device memory to shared memory
        // Each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();

        // Multiply Asub and Bsub together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write Csub to device memory
    // Each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size,
               cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size,
               cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size,
               cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Allocate zero-initialized host matrices and run the multiplication.
// MatMul() assumes all dimensions are multiples of BLOCK_SIZE.
int main()
{
    const int N = 2 * BLOCK_SIZE;
    Matrix a, b, c;
    a.width = a.stride = N; a.height = N;
    b.width = b.stride = N; b.height = N;
    c.width = c.stride = N; c.height = N;
    a.elements = (float*)calloc(N * N, sizeof(float));
    b.elements = (float*)calloc(N * N, sizeof(float));
    c.elements = (float*)calloc(N * N, sizeof(float));

    MatMul(a, b, c);

    free(a.elements);
    free(b.elements);
    free(c.elements);
}