如何写网站建设实验结果分析,长治软件制作平台,扁平化设计网站 源码,成都大型商城网站建设矩阵乘可以利用gpu多线程并行的特点进行加速计算#xff0c;但是传统简单的方法需要多次读取数据到寄存器中#xff0c;增加耗时#xff0c;因此利用gpu的共享内存可以被一个block内的所有线程访问到的特性#xff0c;结合tiling技术进行加速计算。 理论部分不解释了#…矩阵乘可以利用gpu多线程并行的特点进行加速计算但是传统简单的方法需要多次读取数据到寄存器中增加耗时因此利用gpu的共享内存可以被一个block内的所有线程访问到的特性结合tiling技术进行加速计算。 理论部分不解释了网上有很多关键在于网上很多利用共享内存计算的代码存在错误大部分只有在设置blockDim.x blockDim.y 的时候凑巧能对齐index给出正确的结果若这俩不等结果就错了这里给出一个修正的版本
#include stdio.h
#include stdlib.h
#include time.h
#include math.h
#include assert.h
#include cuda_runtime.h
#include device_launch_parameters.h#define M 32
#define K 32
#define N 32void initial(float *array, int size)
{for (int i 0; i size; i){array[i] (float)(1);}
}void printMatrix(float *array, int row, int col)
{float *p array;for (int y 0; y row; y){for (int x 0; x col; x){printf(%.2f , p[x]);}p p col;printf(\n);}return;
}__global__ void multiplicateMatrixOnDevice(float *array_A, float *array_B, float *array_C, int M_p, int K_p, int N_p)
{int ix threadIdx.x blockDim.x*blockIdx.x;//row numberint iy threadIdx.y blockDim.y*blockIdx.y;//col numberif (ix N_p iy M_p){float sum 0;for (int k 0; k K_p; k){sum array_A[iy*K_p k] * array_B[k*N_p ix];}array_C[iy*N_p ix] sum;}
}// Compute C A * B
// M, K, K, N, M, N
__global__ void matrixMultiplyShared(float *A, float *B, float *C,int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns)
{// Insert code to implement matrix multiplication here// You have to use shared memory for this MP// 1. 相比网上代码修改这里的index__shared__ float sharedM[8][16]; __shared__ float sharedN[16][8]; int bx blockIdx.x; int by blockIdx.y; int tx threadIdx.x; int ty threadIdx.y; int row by * blockDim.y ty; int col bx * blockDim.x tx; float Csub 0.0;// for (int i 0; i 2; i) for (int i 0; i (int)(ceil((float)numAColumns / blockDim.x)); i){if (i*blockDim.x tx numAColumns row numARows)sharedM[ty][tx] A[row*numAColumns i*blockDim.x tx];elsesharedM[ty][tx] 0.0;// 2. 相比网上代码修改这里的indexif (i*blockDim.x tx numBRows col numBColumns)sharedN[tx][ty] B[(i*blockDim.x tx)*numBColumns col];elsesharedN[tx][ty] 0.0;__syncthreads();// if (blockIdx.x 0 blockIdx.y 1 threadIdx.x 0 threadIdx.y 0 ) {// printf(sharedM: \n);// for (int i 0; i 8; i) {// for (int j 0; j 16; j) {// printf(%f , sharedM[i][j]);// }// printf(\n);// }// printf(sharedN: \n);// for (int i 0; i 16; i) {// for (int j 0; j 8; j) {// printf(%f , sharedM[i][j]);// }// printf(\n);// }// }for (int j 0; j blockDim.x; j)// 3. 相比网上代码修改这里的indexCsub sharedM[ty][j] * sharedN[j][ty];__syncthreads();}if (row numCRows col numCColumns)C[row*numCColumns col] Csub;}int main(int argc, char **argv)
{clock_t start 0, finish 0;float time;int Axy M * K;int Bxy K * N;int Cxy M * N;float *h_A, *h_B, *hostRef, *deviceRef;h_A (float*)malloc(Axy * sizeof(float));h_B (float*)malloc(Bxy * sizeof(float));int nBytes M * N * sizeof(float);hostRef (float*)malloc(Cxy * sizeof(float));deviceRef (float*)malloc(Cxy * sizeof(float));initial(h_A, Axy);initial(h_B, Bxy);// printMatrix(h_A, M, K);float *d_A, *d_B, *d_C;cudaMalloc((void**)d_A, Axy * sizeof(float));cudaMalloc((void**)d_B, Bxy * sizeof(float));cudaMalloc((void**)d_C, Cxy * sizeof(float));cudaMemcpy(d_A, h_A, Axy * sizeof(float), cudaMemcpyHostToDevice);cudaMemcpy(d_B, h_B, Bxy * sizeof(float), cudaMemcpyHostToDevice);int dimx 16;int dimy 16;dim3 block(dimx, dimy);dim3 grid((M block.x - 1) / block.x, (N block.y - 1) / block.y);cudaEvent_t gpustart, gpustop;float elapsedTime 0.0;cudaEventCreate(gpustart);cudaEventCreate(gpustop);cudaEventRecord(gpustart, 0);// multiplicateMatrixOnDevicegrid,block (d_A, d_B, d_C, M, K, N);matrixMultiplyShared grid, block (d_A, d_B, d_C, M, K, K, N, M, N);cudaDeviceSynchronize();cudaEventRecord(gpustop, 0);cudaEventSynchronize(gpustop);cudaEventElapsedTime(elapsedTime, gpustart, gpustop);cudaEventDestroy(gpustart);cudaEventDestroy(gpustop);cudaMemcpy(deviceRef, d_C, Cxy * sizeof(float), cudaMemcpyDeviceToHost);printMatrix(deviceRef, M, N);return 0;
}