Example: CUDA Matrix Multiplication in C

// CUDA Matrix Multiplication
#include <stdio.h>
#include <stdlib.h> // For malloc and free
#include <cuda_runtime.h> // For CUDA API functions

// Define matrix dimensions
#define M 256
#define K 256
#define N 256

// Wrap any expression yielding a cudaError_t. The stringified expression and the
// call site (__FILE__ / __LINE__) are forwarded to check() for reporting.
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)

// Reports a failed CUDA call and terminates the process.
//   err  - status to inspect
//   func - text of the expression that produced err
//   file - source file of the call site
//   line - source line of the call site
// Returns normally only when err is cudaSuccess; otherwise writes one line to
// stderr and exits with EXIT_FAILURE.
void check(cudaError_t err, const char* const func, const char* const file, const int line) {
    if (err == cudaSuccess) {
        return;
    }

    const char* const reason = cudaGetErrorString(err);
    fprintf(stderr, "CUDA Error at %s:%d: %s %s\n", file, line, func, reason);
    exit(EXIT_FAILURE);
}

// CUDA kernel to perform matrix multiplication: C = A * B.
//
// Storage: every matrix is a contiguous row-major float array.
//   A is heightA x widthA, B is heightB x widthB, C is heightA x widthB.
// Preconditions:
//   - widthA must equal heightB (the shared K dimension). heightB itself is
//     not read by the kernel; it is kept only for the existing call signature.
//   - A and B are only read; C is only written. The pointers are declared
//     __restrict__, so C must not overlap A or B.
// Launch layout: 2D grid of 2D blocks, x -> column of C, y -> row of C.
//   Each thread computes at most one element of C; threads that land outside
//   C do nothing, so the grid may be rounded up past the matrix edges.
// No shared memory is used.
__global__ void matrixMulKernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C,
                                int widthA, int heightA, int widthB, int heightB) {
    // Element of C owned by this thread
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the bottom/right edge of C have nothing to compute
    if (row >= heightA || col >= widthB) {
        return;
    }

    // Offsets are computed in size_t: row * width in plain int can overflow
    // once a matrix holds more than INT_MAX elements.
    const size_t rowBaseA = (size_t)row * (size_t)widthA;
    const size_t strideB = (size_t)widthB;

    float sum = 0.0f;
    for (int i = 0; i < widthA; ++i) { // walks the K dimension
        sum += A[rowBaseA + i] * B[(size_t)i * strideB + col];
    }
    C[(size_t)row * strideB + col] = sum;
}

// Fills a rows x cols matrix (stored as one contiguous array of rows * cols
// floats) with values rand() / RAND_MAX, i.e. each entry lies in [0, 1].
// The function does not seed the generator; the sequence is whatever rand()
// yields at the time of the call.
void initializeMatrix(float* matrix, int rows, int cols) {
    const float scale = (float)RAND_MAX;
    float* cursor = matrix;

    // Count down over the element total; a non-positive total writes nothing.
    for (int remaining = rows * cols; remaining > 0; --remaining) {
        *cursor++ = rand() / scale;
    }
}

// Writes a rows x cols row-major matrix to stdout: one text line per row,
// each value formatted with two decimals and followed by a space.
// If either dimension exceeds 10, only a "too large" notice is printed.
void printMatrix(float* matrix, int rows, int cols) {
    const int maxDim = 10;
    const bool fitsOnScreen = (rows <= maxDim) && (cols <= maxDim);
    if (!fitsOnScreen) {
        printf("Matrix too large to print.\n");
        return;
    }

    // Elements are visited in storage order, so a single running index
    // is equivalent to computing r * cols + c for each entry.
    int flat = 0;
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            printf("%.2f ", matrix[flat]);
            ++flat;
        }
        printf("\n");
    }
}

// Host driver: builds two random matrices A (M x K) and B (K x N), multiplies
// them on the GPU with matrixMulKernel, copies C (M x N) back, and releases
// all host and device buffers. Any CUDA failure terminates the process via
// CHECK_CUDA_ERROR. Returns 0 on success.
int main() {
    // Step 1: Allocate memory on host (CPU)
    // The first factor is cast so the element count is multiplied in size_t;
    // M * K as a plain int product would overflow for large dimensions.
    const size_t sizeA = (size_t)M * K * sizeof(float);
    const size_t sizeB = (size_t)K * N * sizeof(float);
    const size_t sizeC = (size_t)M * N * sizeof(float);

    float* h_A = (float*)malloc(sizeA);
    float* h_B = (float*)malloc(sizeB);
    float* h_C = (float*)malloc(sizeC);

    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host memory!\n");
        exit(EXIT_FAILURE);
    }

    // Step 2: Initialize host matrices
    printf("Initializing host matrices...\n");
    initializeMatrix(h_A, M, K);
    initializeMatrix(h_B, K, N);

    // Optional: Print small matrices (printMatrix skips anything above 10 x 10)
    printf("Matrix A (M x K):\n");
    printMatrix(h_A, M, K);
    printf("Matrix B (K x N):\n");
    printMatrix(h_B, K, N);

    // Step 3: Allocate memory on device (GPU)
    float *d_A, *d_B, *d_C;
    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_A, sizeA));
    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_B, sizeB));
    CHECK_CUDA_ERROR(cudaMalloc((void**)&d_C, sizeC));

    // Step 4: Copy input matrices from host to device
    printf("Copying data from host to device...\n");
    CHECK_CUDA_ERROR(cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERROR(cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice));

    // Step 5: Define grid and block dimensions for the kernel launch
    // A common choice is 16x16 or 32x32 threads per block
    const int THREADS_PER_BLOCK_X = 16;
    const int THREADS_PER_BLOCK_Y = 16;

    dim3 dimBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y);
    // Ceiling division so partial tiles at the right/bottom edge are covered;
    // the kernel's bounds check discards the surplus threads.
    dim3 dimGrid(
        (N + dimBlock.x - 1) / dimBlock.x, // Grid width (cols of C)
        (M + dimBlock.y - 1) / dimBlock.y  // Grid height (rows of C)
    );

    // Step 6: Launch the CUDA kernel
    printf("Launching CUDA kernel...\n");
    matrixMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, K, M, N, K); // K is widthA and heightB
    CHECK_CUDA_ERROR(cudaGetLastError());      // Errors from the launch itself (e.g. bad configuration)
    CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // Errors raised while the kernel executes, reported here
                                               // instead of at the following cudaMemcpy

    // Step 7: Copy the result matrix from device to host
    printf("Copying result from device to host...\n");
    CHECK_CUDA_ERROR(cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost));

    // Step 8: Print the result (optional for small matrices)
    printf("Result Matrix C (M x N):\n");
    printMatrix(h_C, M, N);

    // Step 9: Free device memory
    CHECK_CUDA_ERROR(cudaFree(d_A));
    CHECK_CUDA_ERROR(cudaFree(d_B));
    CHECK_CUDA_ERROR(cudaFree(d_C));

    // Step 10: Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Matrix multiplication completed and memory freed.\n");
    return 0;
}