C Online Compiler
Example: CUDA Matrix Multiplication in C
C
C++
C#
Java
Python
PHP
main.c
STDIN
Run
// CUDA Matrix Multiplication
//
// Computes C = A * B for dense, row-major, single-precision matrices, where
// A is M x K, B is K x N and C is M x N. The host fills A and B with
// pseudo-random values, runs the multiplication on the GPU, copies the result
// back and checks it against a CPU reference.

#include <math.h>          // For fabs and fmax
#include <stdio.h>
#include <stdlib.h>        // For malloc, free, rand and exit
#include <cuda_runtime.h>  // For CUDA API functions

// Define matrix dimensions
#define M 256
#define K 256
#define N 256

// CUDA error check macro: forwards the call's status together with its source
// text and location to check().
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)

// Reports a failed CUDA runtime call and terminates the process.
//
//   err  - status returned by the CUDA call
//   func - source text of the call (stringified by CHECK_CUDA_ERROR)
//   file - source file of the call site
//   line - source line of the call site
//
// Returns normally only when err == cudaSuccess; otherwise prints a diagnostic
// to stderr and calls exit(EXIT_FAILURE).
void check(cudaError_t err, const char* const func, const char* const file, const int line) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error at %s:%d: %s %s\n", file, line, func, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// CUDA kernel to perform matrix multiplication: C = A * B.
//
// Launch layout: 2D grid of 2D blocks; each thread computes one element of C.
// The x dimension indexes columns of C and the y dimension indexes rows.
// Threads that fall outside the output matrix do nothing, so the grid may
// over-cover C. No shared memory is used.
//
//   A       - heightA x widthA input matrix, row-major (device pointer)
//   B       - heightB x widthB input matrix, row-major (device pointer)
//   C       - heightA x widthB output matrix, row-major (device pointer)
//   widthA  - number of columns of A (the shared K dimension)
//   heightA - number of rows of A and C
//   widthB  - number of columns of B and C
//   heightB - number of rows of B; must equal widthA. It is not read by the
//             kernel and is kept only so the signature describes both inputs.
//
// Precondition: C must not overlap A or B (the pointers are __restrict__).
__global__ void matrixMulKernel(const float* __restrict__ A, const float* __restrict__ B,
                                float* __restrict__ C, int widthA, int heightA, int widthB,
                                int heightB) {
    (void)heightB;  // Intentionally unused; see the parameter note above.

    // Calculate the row and column of the C element to be computed by this thread
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Skip threads that lie outside the bounds of the output matrix C
    if (row >= heightA || col >= widthB) {
        return;
    }

    // Offsets are computed in size_t so that row * width cannot overflow int
    // for large matrices.
    const size_t rowOffsetA = (size_t)row * (size_t)widthA;
    float sum = 0.0f;
    for (int i = 0; i < widthA; ++i) {  // widthA == heightB (K dimension)
        sum += A[rowOffsetA + i] * B[(size_t)i * (size_t)widthB + col];
    }
    C[(size_t)row * (size_t)widthB + col] = sum;
}

// Function to initialize matrices.
//
// Fills a rows x cols matrix with values rand() / (float)RAND_MAX, i.e. floats
// between 0 and 1. rand() is never seeded in this program, so every run uses
// the C library's default sequence. Does nothing if either dimension is not
// positive.
void initializeMatrix(float* matrix, int rows, int cols) {
    if (rows <= 0 || cols <= 0) {
        return;
    }
    // Widen before multiplying so the element count cannot overflow int.
    const size_t count = (size_t)rows * (size_t)cols;
    for (size_t i = 0; i < count; ++i) {
        matrix[i] = rand() / (float)RAND_MAX;  // Random float between 0 and 1
    }
}

// Function to print a matrix (for small matrices).
//
// Prints a row-major rows x cols matrix to stdout, one row per line with two
// decimals per element. If either dimension exceeds 10 only a notice is
// printed instead of the contents.
void printMatrix(const float* matrix, int rows, int cols) {
    if (rows > 10 || cols > 10) {  // Limit printing for large matrices
        printf("Matrix too large to print.\n");
        return;
    }
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            printf("%.2f ", matrix[(size_t)i * (size_t)cols + j]);
        }
        printf("\n");
    }
}

// Compares a GPU result against a CPU reference product.
//
//   A, B  - host copies of the inputs (rows x inner and inner x cols)
//   C     - host copy of the GPU result (rows x cols)
//
// The reference is accumulated in double. An element is accepted when it lies
// within 1e-3 * max(1, |reference|) of the reference; bit-exact equality is
// not expected because host and device may round and order the float
// operations differently.
//
// Returns 1 if every element matches, 0 after reporting the first mismatch.
static int verifyResult(const float* A, const float* B, const float* C, int rows, int inner,
                        int cols) {
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            double ref = 0.0;
            for (int k = 0; k < inner; ++k) {
                ref += (double)A[(size_t)r * (size_t)inner + k] *
                       (double)B[(size_t)k * (size_t)cols + c];
            }
            const double got = (double)C[(size_t)r * (size_t)cols + c];
            const double tol = 1e-3 * fmax(1.0, fabs(ref));
            if (fabs(got - ref) > tol) {
                fprintf(stderr, "Mismatch at C[%d][%d]: GPU %f vs CPU %f\n", r, c, got, ref);
                return 0;
            }
        }
    }
    return 1;
}

// Program entry point: allocates and initializes the matrices, runs the GPU
// multiplication, verifies the result and releases all memory.
//
// Returns 0 on success and EXIT_FAILURE if the GPU result does not match the
// CPU reference. Host allocation failures and CUDA errors terminate the
// process via exit(EXIT_FAILURE).
int main() {
    // Step 1: Allocate memory on host (CPU)
    // Cast to size_t first so the byte counts are computed in size_t, not int.
    const size_t sizeA = (size_t)M * K * sizeof(float);
    const size_t sizeB = (size_t)K * N * sizeof(float);
    const size_t sizeC = (size_t)M * N * sizeof(float);

    float* h_A = (float*)malloc(sizeA);
    float* h_B = (float*)malloc(sizeB);
    float* h_C = (float*)malloc(sizeC);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host memory!\n");
        // free(NULL) is a no-op, so this releases whichever allocations succeeded.
        free(h_A);
        free(h_B);
        free(h_C);
        exit(EXIT_FAILURE);
    }

    // Step 2: Initialize host matrices
    printf("Initializing host matrices...\n");
    initializeMatrix(h_A, M, K);
    initializeMatrix(h_B, K, N);

    // Optional: Print small matrices
    printf("Matrix A (M x K):\n");
    printMatrix(h_A, M, K);
    printf("Matrix B (K x N):\n");
    printMatrix(h_B, K, N);

    // Step 3: Allocate memory on device (GPU)
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_A, sizeA));
    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_B, sizeB));
    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_C, sizeC));

    // Step 4: Copy input matrices from host to device
    printf("Copying data from host to device...\n");
    CHECK_CUDA_ERROR(cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERROR(cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice));

    // Step 5: Define grid and block dimensions for the kernel launch
    // A common choice is 16x16 or 32x32 threads per block
    const unsigned int THREADS_PER_BLOCK_X = 16;
    const unsigned int THREADS_PER_BLOCK_Y = 16;
    dim3 dimBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y);
    // Ceil-divide so the grid covers C even when N or M is not a multiple of
    // the block size; the kernel's bounds check discards the excess threads.
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,   // Grid width (cols of C)
                 (M + dimBlock.y - 1) / dimBlock.y);  // Grid height (rows of C)

    // Step 6: Launch the CUDA kernel
    printf("Launching CUDA kernel...\n");
    matrixMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, K, M, N, K);  // K is widthA and heightB
    CHECK_CUDA_ERROR(cudaGetLastError());       // Check for errors during kernel launch
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());  // Surface errors raised while the kernel ran

    // Step 7: Copy the result matrix from device to host
    printf("Copying result from device to host...\n");
    CHECK_CUDA_ERROR(cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost));

    // Step 8: Print the result (optional for small matrices)
    printf("Result Matrix C (M x N):\n");
    printMatrix(h_C, M, N);

    // Step 9: Verify the GPU result against a CPU reference
    printf("Verifying result against CPU reference...\n");
    const int resultOk = verifyResult(h_A, h_B, h_C, M, K, N);

    // Step 10: Free device memory
    CHECK_CUDA_ERROR(cudaFree(d_A));
    CHECK_CUDA_ERROR(cudaFree(d_B));
    CHECK_CUDA_ERROR(cudaFree(d_C));

    // Step 11: Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    if (!resultOk) {
        fprintf(stderr, "Verification FAILED.\n");
        return EXIT_FAILURE;
    }
    printf("Verification passed.\n");

    printf("Matrix multiplication completed and memory freed.\n");
    return 0;
}
Output
Clear
ADVERTISEMENTS