Current location: Home > CUDA realizes matrix multiplication

CUDA realizes matrix multiplication

2022-06-30 08:31:00 【Wu lele~】

List of articles

Preface
1、 Simple ideas
analysis
2、 Optimize
summary

Preface

This article implements matrix multiplication using CUDA.

1、 Simple ideas

#include <stdio.h>

// Launch geometry used by main: BLOCK_NUM blocks of THREAD_NUM threads each.
#define BLOCK_NUM 8
#define THREAD_NUM 32
// Side length of the square matrices (rows == columns). Parenthesized so the
// macro behaves as a single value inside larger expressions (x / R_SIZE,
// x % R_SIZE, ...), instead of being re-associated by operator precedence.
#define R_SIZE (BLOCK_NUM * THREAD_NUM)
// Total number of elements in one R_SIZE x R_SIZE matrix.
#define M_SIZE (R_SIZE * R_SIZE)

// Naive matrix multiplication kernel: each thread computes one full row of
// the result.
//
// da, db : input matrices, R_SIZE x R_SIZE, row-major (read only).
// dres   : output matrix, R_SIZE x R_SIZE, row-major. The product is ADDED to
//          the existing contents, so the caller must zero it beforehand to
//          obtain the plain product da * db.
//
// Launch layout: 1D grid of 1D blocks. Thread `tid` handles row `tid`, so the
// launch must provide at least R_SIZE threads in total for every row to be
// computed; surplus threads exit immediately.
void __global__ matmul1(const int *da, const int *db, int *dres);

void __global__ matmul1(const int *da, const int *db, int *dres)
{
    // Capture the macro once so later index arithmetic does not depend on
    // how the macro text expands inside a larger expression.
    const int n = R_SIZE;

    // Global (grid-wide) index of this thread == row of the result it owns.
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard against launches with more threads than rows.
    if (tid >= n)
        return;

    for (int c = 0; c < n; ++c)
    {
        // Accumulate the dot product of row `tid` of da and column `c` of db
        // in a local variable, then touch the output element only once.
        int sum = 0;
        for (int r = 0; r < n; ++r)
            sum += da[tid * n + r] * db[r * n + c];

        dres[tid * n + c] += sum;
    }
}


// Report a failed CUDA runtime call on stderr.
// Returns 1 when `err` is cudaSuccess, 0 otherwise.
static int cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 1;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 0;
}

// Multiplies two R_SIZE x R_SIZE all-ones matrices on the GPU with matmul1
// and prints element [0] of the result.
// Returns 0 on success, 1 if any allocation, copy or the launch failed.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const size_t bytes = sizeof(int) * M_SIZE;
    const int n = R_SIZE;

    // Allocate host memory.
    int *ha = (int *)malloc(bytes);
    int *hb = (int *)malloc(bytes);
    int *hres = (int *)malloc(bytes);
    int *da = NULL, *db = NULL, *dres = NULL;

    int ok = (ha != NULL && hb != NULL && hres != NULL);
    if (!ok)
        fprintf(stderr, "host memory allocation failed\n");

    // Initialise the inputs to all ones and the result to zero (the kernel
    // accumulates into the result buffer).
    if (ok)
    {
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                ha[i * n + j] = 1;
                hb[i * n + j] = 1;
                hres[i * n + j] = 0;
            }
        }
    }

    // Allocate device memory. The && chain stops at the first failure.
    ok = ok
        && cuda_ok(cudaMalloc((void **)&da, bytes), "cudaMalloc(da)")
        && cuda_ok(cudaMalloc((void **)&db, bytes), "cudaMalloc(db)")
        && cuda_ok(cudaMalloc((void **)&dres, bytes), "cudaMalloc(dres)");

    // Copy the inputs (and the zeroed result) to the device.
    ok = ok
        && cuda_ok(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
        && cuda_ok(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)")
        && cuda_ok(cudaMemcpy(dres, hres, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dres)");

    // Launch the kernel: BLOCK_NUM * THREAD_NUM threads, one per result row.
    if (ok)
    {
        matmul1<<<BLOCK_NUM, THREAD_NUM>>>(da, db, dres);
        // A launch does not return a status; query it explicitly.
        ok = cuda_ok(cudaGetLastError(), "matmul1 launch");
    }

    // Copy the result back. This blocking copy also waits for the kernel and
    // reports any error raised while it was running.
    ok = ok
        && cuda_ok(cudaMemcpy(hres, dres, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hres)");

    // Print one element as a sanity check.
    if (ok)
        printf("%d\n", hres[0]);

    // Free memory (both free and cudaFree accept null pointers).
    free(ha);
    free(hb);
    free(hres);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dres);

    return ok ? 0 : 1;
}

analysis

The launch creates 256 threads (8 blocks of 32 threads), so the number of threads equals the number of rows in the matrix. In the kernel, the variable tid holds each thread's global ID, in the range [0, 255], and corresponds to one of the 256 rows of the result matrix; that is, each thread computes one full row of the result. Taking tid = 0 as an example, the double loop in the kernel multiplies the elements of a row of da by the elements of each column of db and accumulates the products to obtain the value at the corresponding position of the result.
The next section optimizes the matrix multiplication: with a suitable arrangement of threads, one level of the for loop can be removed.

2、 Optimize

#include <stdio.h>

// Launch geometry used by main: a BLOCK_NUM x BLOCK_NUM grid of
// THREAD_NUM x THREAD_NUM thread blocks.
#define BLOCK_NUM 8
#define THREAD_NUM 32
// Side length of the square matrices (rows == columns). Parenthesized so the
// macro behaves as a single value inside larger expressions (x / R_SIZE,
// x % R_SIZE, ...), instead of being re-associated by operator precedence.
#define R_SIZE (BLOCK_NUM * THREAD_NUM)
// Total number of elements in one R_SIZE x R_SIZE matrix.
#define M_SIZE (R_SIZE * R_SIZE)

// Matrix multiplication kernel: each thread computes one element of the
// result.
//
// da, db : input matrices, R_SIZE x R_SIZE, row-major (read only).
// dres   : output matrix, R_SIZE x R_SIZE, row-major. The product is ADDED to
//          the existing contents, so the caller must zero it beforehand to
//          obtain the plain product da * db.
//
// Launch layout: 2D grid of 2D blocks. The y index selects the row and the
// x index selects the column, so the launch must span at least
// R_SIZE x R_SIZE threads; surplus threads exit immediately.
void __global__ matmul2(const int *da, const int *db, int *dres);

void __global__ matmul2(const int *da, const int *db, int *dres)
{
    // Capture the macro once so later index arithmetic does not depend on
    // how the macro text expands inside a larger expression.
    const int n = R_SIZE;

    // (row, col) of the result element owned by this thread.
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard against launches larger than the matrix.
    if (row >= n || col >= n)
        return;

    // Dot product of row `row` of da and column `col` of db. Element (i, col)
    // of db lives at i * n + col; the row stride is the matrix width, not
    // the thread's row index.
    int sum = 0;
    for (int i = 0; i < n; ++i)
        sum += da[row * n + i] * db[i * n + col];

    dres[row * n + col] += sum;
}


// Report a failed CUDA runtime call on stderr.
// Returns 1 when `err` is cudaSuccess, 0 otherwise.
static int cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 1;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 0;
}

// Multiplies two R_SIZE x R_SIZE all-ones matrices on the GPU with matmul2
// and prints element [0] of the result.
// Returns 0 on success, 1 if any allocation, copy or the launch failed.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const size_t bytes = sizeof(int) * M_SIZE;
    const int n = R_SIZE;

    // Allocate host memory.
    int *ha = (int *)malloc(bytes);
    int *hb = (int *)malloc(bytes);
    int *hres = (int *)malloc(bytes);
    int *da = NULL, *db = NULL, *dres = NULL;

    int ok = (ha != NULL && hb != NULL && hres != NULL);
    if (!ok)
        fprintf(stderr, "host memory allocation failed\n");

    // Initialise the inputs to all ones and the result to zero (the kernel
    // accumulates into the result buffer).
    if (ok)
    {
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                ha[i * n + j] = 1;
                hb[i * n + j] = 1;
                hres[i * n + j] = 0;
            }
        }
    }

    // Allocate device memory. The && chain stops at the first failure.
    ok = ok
        && cuda_ok(cudaMalloc((void **)&da, bytes), "cudaMalloc(da)")
        && cuda_ok(cudaMalloc((void **)&db, bytes), "cudaMalloc(db)")
        && cuda_ok(cudaMalloc((void **)&dres, bytes), "cudaMalloc(dres)");

    // Copy the inputs (and the zeroed result) to the device.
    ok = ok
        && cuda_ok(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
        && cuda_ok(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)")
        && cuda_ok(cudaMemcpy(dres, hres, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dres)");

    // Launch the kernel with one thread per result element:
    // a BLOCK_NUM x BLOCK_NUM grid of THREAD_NUM x THREAD_NUM blocks.
    if (ok)
    {
        const dim3 grid_size(BLOCK_NUM, BLOCK_NUM);
        const dim3 block_size(THREAD_NUM, THREAD_NUM);

        matmul2<<<grid_size, block_size>>>(da, db, dres);
        // A launch does not return a status; query it explicitly.
        ok = cuda_ok(cudaGetLastError(), "matmul2 launch");
    }

    // Copy the result back. This blocking copy also waits for the kernel and
    // reports any error raised while it was running.
    ok = ok
        && cuda_ok(cudaMemcpy(hres, dres, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hres)");

    // Print one element as a sanity check.
    if (ok)
        printf("%d\n", hres[0]);

    // Free memory (both free and cudaFree accept null pointers).
    free(ha);
    free(hb);
    free(hres);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dres);

    return ok ? 0 : 1;
}