并行程序设计(英)大作业——稀疏神经网络推理
稀疏神经网络前向传播
作业背景与要求
本次作业是 2019 的一个并行竞赛题,也不记得是哪个竞赛了,反正题目大致要求下述所示:将一个 60000 * 1024 的矩阵 A 当做左乘矩阵,120 个 1024 * 1024 的稀疏矩阵 B [i], 作为右乘矩阵。A 与 B[i] 相乘得到矩阵 C,C 的维度为 60000 * 1024,对 C 矩阵加 偏置项 bias,并用激活函数 relu 激活,激活后的 C 矩阵作为下一次相乘的矩阵 A 与 B[i+1] 相乘。即神经网络前向传播一次。运算时间通过 gettimeofday(&time,NULL); 获取。
原始代码是通过传统的矩阵乘法,利用三层循环,每个内部循环计算所求矩阵的一个元素,时间复杂度为 O(n^3^),计算一次矩阵相乘的时间需要 10 分钟左右。120个计算下来不得了。所以本次作业的目的就是尽量优化代码,缩短矩阵相乘和 bias+relu 的时间。老师给上一届优化最快的一个小组奖励了樱花键盘,我们这一届也不知道会不会有,反正我不是最快的。
优化思路来源
本学期一共有 5 次家庭作业,4 次都是关于优化矩阵乘法的,我就是筛选出性能最好的一种算法应用于大作业。
- 第一次作业是两个方阵相乘,方阵大小在 100 ~ 2000 之间,并100 为步长共 20 组数据测试。改变循环顺序和使用 openmp 还有 openblas。
上图为改变循环顺序的结果。
上图为改变循环顺序后加openmp的结果,有两个顺序加openmp语句后输出结果不正确,就没有画出来。
上图为 openblas 的结果。
void gemm_OpenBlas(double *A, double *B, double *C, int m, int k, int n) { enum CBLAS_ORDER order = CblasRowMajor; enum CBLAS_TRANSPOSE transposeA = CblasNoTrans; enum CBLAS_TRANSPOSE transposeB = CblasNoTrans; double alpha = 1; double beta = 1; cblas_dgemm(order,transposeA,transposeB,m,n,k,alpha,A,k,B,n,beta,C,n); } gemm_OpenBlas(A, B, C_yours, m, k, n);
很快啊!两个矩阵相乘只需要 0.25s。
- 作业三是使用 CUDA 函数继续作业一。
// --- cublasSgemm (legacy API) ---
// NOTE(review): this snippet creates a v2-style handle but calls the legacy
// cublasSgemm signature (char transpose flags, no handle argument) — confirm
// which cuBLAS header/ABI this was actually compiled against.
cublasHandle_t s;
cublasCreate(&s);
cublasSgemm('N', 'N', m, n, k, 1.0f, d_A, m, d_B, k, 0, d_C, m);
cublasDestroy(s);

// --- cublasSgemm_v2 (handle-based API, scalars passed by pointer) ---
cublasHandle_t s;
cublasCreate_v2(&s);
cublasSgemm_v2(s,CUBLAS_OP_T,CUBLAS_OP_T,m,n,k,&al,d_A,k,d_B,n,&ve,d_C,n);
cublasDestroy_v2(s);

// --- cblas_sgemm (CPU reference, single precision) ---
// NOTE(review): beta = 1 accumulates into C; caller must zero C first for
// a plain C = A*B.
void gemm_OpenBlas(float *A, float *B, float *C, int m, int k, int n) {
    enum CBLAS_ORDER order = CblasRowMajor;
    enum CBLAS_TRANSPOSE transposeA = CblasNoTrans;
    enum CBLAS_TRANSPOSE transposeB = CblasNoTrans;
    float alpha = 1;
    float beta = 1;
    cblas_sgemm(order,transposeA,transposeB,m,n,k,alpha,A,k,B,n,beta,C,n);
}

// --- cuda_shared: tiled matrix multiply staged through shared memory ---
// Lightweight matrix view: `elements` points into a larger row-major array
// whose row pitch is `stride`; width/height are the logical dimensions.
typedef struct {
    int width;
    int height;
    int stride;
    float *elements;
} Matrix;

// Read A[row][col] through the strided view.
__device__ float GetElement(const Matrix A, int row, int col) {
    return A.elements[row * A.stride + col];
}

// Write A[row][col] through the strided view.
__device__ void SetElement(Matrix A, int row, int col, float value) {
    A.elements[row * A.stride + col] = value;
}

// Return the BLOCK_SIZE x BLOCK_SIZE sub-tile of A at tile coords (row, col).
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) {
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}

// True when element (r, c) of tile (rb, cb) lies inside matrix mat_A.
#define ifin(r, c, rb, cb, mat_A) \
    ((c + cb*BLOCK_SIZE < mat_A.width) && (r+rb*BLOCK_SIZE < mat_A.height))

// One thread computes one element of C = A * B. Expects a 2D launch with
// BLOCK_SIZE x BLOCK_SIZE blocks; edge tiles are zero-padded in shared memory.
__global__ void MatMulKernel_SharedMemory(Matrix A, Matrix B, Matrix C) {
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    int row = threadIdx.y;
    int col = threadIdx.x;
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    Matrix subC = GetSubMatrix(C, blockRow, blockCol);
    // Number of tiles along the K dimension (ceiling division).
    int tot = A.width / BLOCK_SIZE + (A.width % BLOCK_SIZE != 0);
    float CValue = 0;
    for (int i = 0; i < tot; ++i) {
        Matrix Asub = GetSubMatrix(A, blockRow, i);
        Matrix Bsub = GetSubMatrix(B, i, blockCol);
        // Out-of-range lanes load 0 so the dot product is unaffected.
        //if (i * BLOCK_SIZE + col < A.width)
        if(ifin(row,col,blockRow,i,A)) {
            As[row][col] = GetElement(Asub, row, col);
        } else
            As[row][col] = 0;
        if(ifin(row,col,i,blockCol,B)) {
            Bs[row][col] = GetElement(Bsub, row, col);
        }else
            Bs[row][col] = 0;
        __syncthreads();
        for (int e = 0; e < BLOCK_SIZE; ++e)
            CValue += As[row][e] * Bs[e][col];
        __syncthreads();
        // NOTE(review): this store sits INSIDE the tile loop, so the partial
        // sum is written to global memory on every iteration; only the final
        // write matters. Hoisting it below the loop would save bandwidth.
        if (ifin(row, col, blockRow, blockCol, C))
            SetElement(subC, row, col, CValue);
    }
}

// Host wrapper: copies A (m x k) and B (k x n) to the device, launches the
// shared-memory kernel, and returns the kernel-only time in seconds through
// *time_value (gettimeofday bracketed by cudaDeviceSynchronize calls).
void gemm_cuda_shared(float *A, float *B, float *C, int m, int k, int n,double *time_value) {
    Matrix d_A;
    d_A.width = d_A.stride = k;
    d_A.height = m;
    size_t size = k * m * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A, size, cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = n;
    d_B.height = k;
    size = n * k * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B, size, cudaMemcpyHostToDevice);
    Matrix d_C;
    d_C.width = d_C.stride = n;
    d_C.height = m;
    size = n * m * sizeof(float);
    cudaMalloc(&d_C.elements, size);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((n / dimBlock.x) + (n % dimBlock.x != 0),
                 (m / dimBlock.y) + (m % dimBlock.y != 0));
    timeval t1,t2;
    *time_value = 0;
    cudaDeviceSynchronize();
    gettimeofday(&t1, nullptr);
    MatMulKernel_SharedMemory<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cudaDeviceSynchronize();
    gettimeofday(&t2, nullptr);
    *time_value+=(t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000000.0;
    // `size` still holds n*m floats here, so this copies all of C.
    cudaMemcpy(C, d_C.elements, size, cudaMemcpyDeviceToHost);
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// --- cuda_global: naive one-thread-per-element kernel on global memory ---
__global__ void MatrixMulKernel(float *A, float *B, float *C, int m, int k, int n) {
    // Each thread computes one element of C
    // by accumulating results into Cvalue
    float Cvalue = 0;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < m && col < n) {
        for (int e = 0; e < k; ++e)
            Cvalue += A[row * k + e] * B[e * n + col];
        C[row * n + col] = Cvalue;
    }
}

// Host wrapper for the naive kernel; timing mirrors gemm_cuda_shared.
void gemm_cuda(float *A, float *B, float *C, int m, int k, int n,double *time_value) {
    float *d_A, *d_B, *d_C;
    size_t size = m * k * sizeof(float);
    cudaMalloc(&d_A, size);
    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    size = k * n * sizeof(float);
    cudaMalloc(&d_B, size);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);
    size = m * n * sizeof(float);
    cudaMalloc(&d_C, size);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((n / dimBlock.x) + (n % dimBlock.x != 0),
                 (m / dimBlock.y) + (m % dimBlock.y != 0));
    timeval t1,t2;
    *time_value = 0;
    cudaDeviceSynchronize();
    gettimeofday(&t1, nullptr);
    MatrixMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, m, k, n);
    cudaDeviceSynchronize();
    gettimeofday(&t2, nullptr);
    *time_value += (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000000.0;
    // `size` is m*n floats here, matching d_C.
    cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
最快的是 cublas_sgemm 的矩阵只用了大约 0.0075s。其次就是 cublas_sgemm_v2 耗时 0.0175s 左右。
- 作业四,是稀疏方阵乘瘦矩阵,就是将方阵转化为 csr 的存储格式。然后再按照csr的方式将矩阵相乘。虽然使用csr方式进行计算耗费时间较少,但是需要的前期准备太多,而且在多个矩阵相乘过程中需要多次转换矩阵形式与分配内存,每次参与运算矩阵的 csr 格式表示都不相同,csr 的三个数组长度也不同,所以每次都需重新分配内存、释放内存,会带来较多的时间消耗。
虽然利用 csr 格式的稀疏矩阵相乘有以上的缺点,但还是对此方法进行了实验,下面是在网上拼凑的代码,原理不是很明白,只是粘取了函数部分,然后在调用函数部分做了改动。将矩阵转化为 csr 存储格式后利用cusparseSpMM:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <stdbool.h>
#include "mmio.h"
#include "mmiohighlevel.h"
#include <omp.h>
#define nthreads 1024
//#ifndef MATRIXMULTISAMPLES_CUSPARSE_SPMM_CUH
//#define MATRIXMULTISAMPLES_CUSPARSE_SPMM_CUH
#include <cusparse.h>
// NOTE(review): memcpy/strcpy/strcat are used below but <string.h> is not
// included — this relies on implicit declarations; confirm it compiles cleanly.

// CSR representation of a sparse matrix.
typedef struct {
    VALUE_TYPE *value;       // nonzero values
    int *columnindex;        // column index of each nonzero
    int *rowpointer;         // row start offsets, length rows+1
}SMatrix;

// Converts the dense m x n host matrix C0 to CSR on the GPU via cuSPARSE
// (cusparseSnnz + cusparseSdense2csr) and copies the three CSR arrays back
// into the caller-provided host buffers value/columnindex/rowpointer.
// NOTE(review): the cusparseSnnz/Sdense2csr calls pass lda = m, which is the
// column-major convention — verify the row/column interpretation matches the
// row-major C0 the caller builds.
// NOTE(review): nnzPerRow is cudaMalloc'd but never freed, and `handle` is
// never destroyed — both leak on every call (this runs 120 times).
void dense2csr(int m, int n, VALUE_TYPE *C0, VALUE_TYPE *value, int *columnindex, int *rowpointer){
    VALUE_TYPE *d_C0;
    size_t size = m * n * sizeof(VALUE_TYPE);
    cudaMalloc(&d_C0, size);
    cudaMemcpy(d_C0, C0, size, cudaMemcpyHostToDevice);
    size = m * sizeof(int);
    int *nnzPerRow = 0;
    cudaMalloc(&nnzPerRow, size);
    int nzz = 0;
    cusparseHandle_t handle = 0;
    cusparseCreate(&handle);
    cusparseMatDescr_t descrA = 0;
    cusparseCreateMatDescr(&descrA);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW;
    // Count nonzeros per row and in total.
    cusparseSnnz(handle,dirA,m,n,descrA, d_C0,m,nnzPerRow,&nzz);
    float *csrValA;
    size = nzz * sizeof(VALUE_TYPE);
    cudaMalloc(&csrValA, size);
    int *csrRowPtrA, *csrColIndA;
    size = (m+1) * sizeof(int);
    cudaMalloc(&csrRowPtrA, size);
    size = nzz * sizeof(int);
    cudaMalloc(&csrColIndA, size);
    cusparseSdense2csr(handle,m,n,descrA,d_C0, m,nnzPerRow,csrValA,csrRowPtrA,csrColIndA);
    size = nzz * sizeof(VALUE_TYPE);
    cudaMemcpy(value, csrValA, size, cudaMemcpyDeviceToHost);
    size = (m+1) * sizeof(int);
    cudaMemcpy(rowpointer, csrRowPtrA, size, cudaMemcpyDeviceToHost);
    size = nzz * sizeof(int);
    cudaMemcpy(columnindex, csrColIndA, size, cudaMemcpyDeviceToHost);
    cudaFree(d_C0);
    cudaFree(csrValA);
    cudaFree(csrRowPtrA);
    cudaFree(csrColIndA);
}

// In-place exclusive prefix sum: array[i] becomes the sum of the original
// array[0..i-1]. (Not referenced by main below as far as this file shows.)
void scan(int *array, int len) {
    int old, _new;
    old = array[0];
    array[0] = 0;
    for (int i = 1; i < len; i++) {
        _new = array[i];
        array[i] = old + array[i - 1];
        old = _new;
    }
}

// Re-lays out `val` (line x ld) from column-major to row-major in place,
// using a temporary heap buffer.
void toRowIndx_(int line, int ld, VALUE_TYPE *val) {
    VALUE_TYPE *temp = (VALUE_TYPE *) malloc(sizeof(VALUE_TYPE) * line * ld);
    for (int i = 0; i < line; ++i) {
        for (int j = 0; j < ld; ++j) {
            temp[i * ld + j] = val[j * line + i];
        }
    }
    memcpy(val, temp, sizeof(VALUE_TYPE) * line * ld);
    free(temp);
}

// Re-lays out `val` (line x ld, row-major) into column-major in place.
void toColIndx_(int line, int ld, VALUE_TYPE *val) {
    VALUE_TYPE *temp = (VALUE_TYPE *) malloc(sizeof(VALUE_TYPE) * line * ld);
    for (int i = 0; i < ld; ++i) {
        for (int j = 0; j < line; ++j) {
            temp[i * line + j] = val[j * ld + i];
        }
    }
    memcpy(val, temp, sizeof(VALUE_TYPE) * line * ld);
    free(temp);
}

// Driver: loads A (CSR) and the 120 weight matrices, runs 120 rounds of
// cusparseSpMM + bias + clamped ReLU, re-sparsifying the activations back
// into A after every layer, then writes nonzero row indices for checking.
int main(int argc, char ** argv) {
    struct timeval t1, t2, t3, t4;
    int size1 = 0;
    int size2 = 0;
    int *tc1;
    int *tc2;
    double bias = -0.3000;
    int mA;
    int nA;
    int nnzA;
    int isSymmetricA;
    SMatrix A;
    int mB;
    int nB;
    int nnzB;
    int isSymmetricB;
    SMatrix B[120];
    int mC,nC;
    omp_set_num_threads(nthreads);
    // load matrix data from file
    gettimeofday(&t3, NULL);
    char filename1[]="sparse-images-1024.tsv";
    mmio_info(&mA, &nA, &nnzA, &isSymmetricA, filename1);
    A.value=(VALUE_TYPE*)malloc((nnzA)*sizeof(VALUE_TYPE));
    A.columnindex=(int*)malloc((nnzA)*sizeof(int));
    A.rowpointer=(int*)malloc((mA+1)*sizeof(int));
    mmio_data(A.rowpointer, A.columnindex, A.value, filename1);
    printf("input matrix A: ( %i, %i ) nnz = %i\n", mA, nA, nnzA);
    /* VALUE_TYPE *A0 = (VALUE_TYPE *)malloc(mA * nA * sizeof(VALUE_TYPE)); memset(A0, 0, sizeof(VALUE_TYPE) * mA * nA); for (int i = 0; i < mA; i++) { for (int j = A.rowpointer[i]; j < A.rowpointer[i+1]; j++) { A0[i * nA + A.columnindex[j]] = A.value[j]; } } free(A.rowpointer); free(A.columnindex); free(A.value); */
    // Load the 120 weight matrices and densify each into B0[k] (row-major).
    char neuronfile1[] = "neuron1024/n1024-l";
    char neuronfile2[] = ".tsv";
    char filename3[60];
    VALUE_TYPE *B0[120];
    for (int k = 0; k < 120; k++) {
        char filenum[5];
        int k1=k+1;
        snprintf(filenum,sizeof(filenum),"%d",k1);
        strcpy(filename3, neuronfile1);
        strcat(filename3, filenum);
        strcat(filename3, neuronfile2);
        mmio_info(&mB, &nB, &nnzB, &isSymmetricB, filename3);
        B[k].value=(VALUE_TYPE*)malloc((nnzB)*sizeof(VALUE_TYPE));
        B[k].columnindex=(int*)malloc((nnzB)*sizeof(int));
        B[k].rowpointer=(int*)malloc((mB+1)*sizeof(int));
        mmio_data(B[k].rowpointer, B[k].columnindex, B[k].value, filename3);
        B0[k] = (VALUE_TYPE *)malloc(mB * nB * sizeof(VALUE_TYPE));
        memset(B0[k], 0, sizeof(VALUE_TYPE) * mB * nB);
        for (int i = 0; i < mB; i++) {
            for (int j = B[k].rowpointer[i]; j < B[k].rowpointer[i+1]; j++) {
                B0[k][i * nB + B[k].columnindex[j]] = B[k].value[j];
            }
        }
        free(B[k].rowpointer);
        free(B[k].columnindex);
        free(B[k].value);
    }
    gettimeofday(&t4,NULL);
    double time_load = (t4.tv_sec - t3.tv_sec) * 1000.0 + (t4.tv_usec - t3.tv_usec) / 1000.0;
    printf("Weight matrix load time: %f ms \n",time_load);
    mC = mA;
    nC = nB;
    // NOTE(review): C's three arrays are allocated but C is never used below.
    SMatrix C;
    C.value=(VALUE_TYPE*)malloc((mA*nB)*sizeof(VALUE_TYPE));
    C.columnindex=(int*)malloc((mA*nB)*sizeof(int));
    C.rowpointer=(int*)malloc((mA+1)*sizeof(int));
    VALUE_TYPE *C0 =(VALUE_TYPE *)malloc((mA*nB)*sizeof(VALUE_TYPE));
    float *dRes;
    // Upload all 120 weight matrices once, converted to column-major for the
    // CUSPARSE_ORDER_COL dense descriptor used below.
    VALUE_TYPE *d_B[120];
    for (int k=0;k<120;k++) {
        size_t size = nA * nB * sizeof(VALUE_TYPE);
        toColIndx_(nB,mB,B0[k]);
        cudaMalloc(&d_B[k], size);
        cudaMemcpy(d_B[k], B0[k], size, cudaMemcpyHostToDevice);
    }
    gettimeofday(&t3, NULL);
    for (int k = 0; k < 120; k++) {
        //memset(C0, 0, sizeof(VALUE_TYPE)*mC*nC);
        // Upload the current activation matrix A (CSR) for this layer.
        int *dRow, *dCol;
        size_t size = (mA + 1) * sizeof(int);
        cudaMalloc(&dRow, size);
        cudaMemcpy(dRow, A.rowpointer, size, cudaMemcpyHostToDevice);
        size = A.rowpointer[mA]* sizeof(int);
        cudaMalloc(&dCol, size);
        cudaMemcpy(dCol, A.columnindex, size, cudaMemcpyHostToDevice);
        VALUE_TYPE *dA, *dB0;
        size = A.rowpointer[mA]* sizeof(VALUE_TYPE);
        cudaMalloc(&dA, size);
        cudaMemcpy(dA, A.value, size, cudaMemcpyHostToDevice);
        size = mB * nB* sizeof(VALUE_TYPE);
        cudaMalloc(&dB0, size);
        cudaMemcpy(dB0, d_B[k], size, cudaMemcpyDeviceToDevice);
        size = mA* nA * sizeof(VALUE_TYPE);
        cudaMalloc(&dRes, size);
        // NOTE(review): a cusparse handle is created every iteration and
        // never destroyed — 120 leaked handles.
        cusparseHandle_t handle;
        cusparseCreate(&handle);
        cusparseOperation_t a = CUSPARSE_OPERATION_NON_TRANSPOSE;
        cusparseOperation_t b = CUSPARSE_OPERATION_NON_TRANSPOSE;
        VALUE_TYPE al = 1, be = 0;
        cusparseSpMatDescr_t csrMtxA;
        // NOTE(review): nnzA is the nnz of the ORIGINAL input and is never
        // updated, but A is re-sparsified each layer (actual nnz is
        // A.rowpointer[mA]) — confirm cusparseCreateCsr tolerates this.
        cusparseCreateCsr(&csrMtxA, (int64_t) mA, (int64_t) nA, (int64_t) nnzA, dRow, dCol, dA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
        cusparseDnMatDescr_t dnsMtxB;
        cusparseCreateDnMat(&dnsMtxB, (int64_t) mB, (int64_t) nB, (int64_t) nB, dB0, CUDA_R_32F, CUSPARSE_ORDER_COL);
        cusparseDnMatDescr_t dnsMtxC;
        cusparseCreateDnMat(&dnsMtxC, (int64_t) mA, (int64_t) nA, (int64_t) mA, dRes, CUDA_R_32F, CUSPARSE_ORDER_COL);
        gettimeofday(&t1,NULL);
        // NOTE(review): no cudaDeviceSynchronize before/after, so this pair
        // of gettimeofday calls may measure only the asynchronous launch.
        cusparseSpMM(handle, a, b, &al, csrMtxA, dnsMtxB, &be, dnsMtxC, CUDA_R_32F, CUSPARSE_MM_ALG_DEFAULT, NULL);
        gettimeofday(&t2,NULL);
        double time_gemm = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
        gettimeofday(&t1,NULL);
        gettimeofday(&t2,NULL);
        gettimeofday(&t1,NULL);
        // The blocking copy below also synchronizes with the SpMM.
        cudaMemcpy(C0, dRes,(mC*nC)*sizeof(VALUE_TYPE) , cudaMemcpyDeviceToHost);
        cudaFree(dCol);
        cudaFree(dRow);
        cudaFree(dRes);
        cudaFree(dA);
        cudaFree(dB0);
        // Bias + ReLU clamped to [0, 32] on the host, parallelized with OpenMP.
        #pragma omp parallel for
        for (int i = 0; i < mC*nC; i++) {
            C0[i] += bias;
            if (C0[i] <= 0) {
                C0[i] = 0;
            } else if (C0[i] >= 32) {
                C0[i] = 32;
            }
        }
        // Re-sparsify the activations into A for the next layer.
        // NOTE(review): the previous A.* buffers and the temporaries
        // rowpointer/columnindex/value are never freed — this leaks
        // several hundred MB per iteration.
        int *rowpointer, *columnindex;
        VALUE_TYPE* value;
        columnindex = (int*)malloc((mA*nA)*sizeof(int));
        rowpointer = (int*)malloc((mA+1)*sizeof(int));
        value = (VALUE_TYPE*)malloc(mA*nA*sizeof(VALUE_TYPE));
        dense2csr(mA, nB, C0, value, columnindex, rowpointer);
        A.rowpointer = (int*)malloc((mA+1)*sizeof(int));
        A.columnindex = (int*)malloc(rowpointer[mA]*sizeof(int));
        A.value = (VALUE_TYPE*)malloc(rowpointer[mA]*sizeof(VALUE_TYPE));
        memcpy(A.rowpointer, rowpointer, (mA+1)*sizeof(int));
        memcpy(A.columnindex, columnindex, rowpointer[mA]*sizeof(int));
        memcpy(A.value, value, rowpointer[mA]*sizeof(VALUE_TYPE));
        gettimeofday(&t2,NULL);
        double time_biasrelu = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
        printf("k = %d, GEMM time: %4.5f ms, Bias+ReLU time: %4.5f ms\n", k+1, time_gemm, time_biasrelu);
    }
    //cudaMemcpy(A0, d_C, (mC*nC)*sizeof(VALUE_TYPE),cudaMemcpyDeviceToHost);
    gettimeofday(&t4,NULL);
    double time_inference = (t4.tv_sec - t3.tv_sec) * 1000.0 + (t4.tv_usec - t3.tv_usec) / 1000.0;
    printf("Inference time: %f ms \n",time_inference);
    //free(C0);
    // Densify the final activations for the result check below.
    VALUE_TYPE *A0 = (VALUE_TYPE *)malloc(mA * nA * sizeof(VALUE_TYPE));
    memset(A0, 0, sizeof(VALUE_TYPE) * mA * nA);
    for (int i = 0; i < mA; i++) {
        for (int j = A.rowpointer[i]; j < A.rowpointer[i+1]; j++) {
            A0[i * nA + A.columnindex[j]] = A.value[j];
        }
    }
    // check results
    // Write the 1-based index of every row with a nonzero sum, then compare
    // that file byte-wise (read as ints) against the reference categories.
    printf("test\n");
    FILE* fs;
    fs=fopen("sparse-images-1024-1.tsv","w+");
    for (int i = 0; i <mA; i++) {
        int sum =0;
        for (int j = (i*nA); j < ((i+1)*nA); j++) {
            sum+=A0[j];
        }
        if(sum!=0) {
            fprintf(fs,"%d\n", i+1);
        }
    }
    fclose(fs);
    FILE* fp2=NULL;
    fp2 = fopen("sparse-images-1024-1.tsv", "rb");
    if (fp2 == NULL) {
        printf("Error:Open file fail!\n");
    }
    fseek(fp2, 0, SEEK_END);
    size2 = ftell(fp2);
    rewind(fp2);
    tc2 = (int*)malloc(sizeof(int) * size2/4);
    int readnum2 = fread(tc2, 4, size2/4, fp2);
    fclose(fp2);
    FILE* fp1;
    fp1 = fopen("neuron1024-l120-categories.tsv", "rb");
    if (fp1 == NULL) {
        printf("Error:Open file fail!\n");
    }
    fseek(fp1, 0, SEEK_END);
    size1 = ftell(fp1);
    rewind(fp1);
    tc1 = (int*)malloc(sizeof(int) * size1/4);
    int readnum1 = fread(tc1, 4, size1/4, fp1);
    fclose(fp1);
    // NOTE(review): both files are text, compared as raw 4-byte ints; this
    // only works because both are produced in the identical format.
    int judge=0;
    for(int i=0;i<size1/4;i++) {
        if(tc1[i]-tc2[i] != 0) {
            judge++;
        }
    }
    printf("judge:%d\n",judge);
    if (judge == 0) {
        printf("CHALLENGE PASSED\n");
    } else {
        printf("CHALLENGE FAILED\n");
    }
    free(A0);
    return 0;
}
这个时间大约是24s左右,bias和relu使用kernel的话会快几秒。相比于后面要介绍的 cublasSgemm 还是要慢好多的。
- 作业五是使用 mpi 计算矩阵乘法。学这个的时候因为疫情被学校赶回家就随便水过去了。就不考虑它了。但有人用它达到了3s,是班里最快的。
矩阵乘法优化思路
选择最快的 cublasSgemm
cublasSgemm是NV cublas库的矩阵相乘API,由于 cublas 中矩阵的存储是列优先,所以cublasSgemm API的参数比较难配置,所以就先从 cublasSgemm 的参数配置入手,正确设置完cublasSgemm 的参数后再进行前向传播任务。
二维矩阵的行优先与列优先存储
矩阵在逻辑上表达为2维M行K列,但存储到内存的时候都是按一维布局,其中按行优先存储和按列优先存储的差异如上图所示。
如上图所示,当矩阵按行优先存储然后又按相反的列优先读取的话,就会得到原矩阵转置的结果;同理适用于按列优先存储然后按行优先读取。
cublasSgemm 函数参数
cublasStatus_t cublasSgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)
This function performs the matrix-matrix multiplication:
$$ C = \alpha \,\mathrm{op}(A)\,\mathrm{op}(B) + \beta C $$
where α and β are scalars, and A , B and C are matrices stored in column-major format with dimensions op(A) m×k , op(B) k×n and C m×n , respectively. Also, for matrix A, op(A) = A if transa == CUBLAS_OP_N, and op(A) = A^T^ if transa == CUBLAS_OP_T;
and op(B) is defined similarly for matrix B .
官方文档的参数解释如下:
我们可以从中获取以下几个重点:
根据文档说可以知道,cublasSgemm完成了 C = alpha * op ( A ) * op ( B ) + beta * C 的矩阵乘加运算
其中alpha和beta是标量, A B C是以列优先存储的矩阵
如果 transa的参数是CUBLAS_OP_N 则op(A) = A ,如果是CUBLAS_OP_T 则op(A)=A的转置,至于CUBLAS_OP_C就不用管了,这次作业用不到。
如果 transb的参数是CUBLAS_OP_N 则op(B) = B ,如果是CUBLAS_OP_T 则op(B)=B的转置
由于API中的矩阵参数也用A B C表示,为了不和下面例子中的矩阵A B混淆,我们将cublasSgemm中的参数做如下的调整
- A称为乘法左矩阵
- B称为乘法右矩阵
- C称为结果矩阵
所以当alpha =1 并且 beta =0 的时候 cublasSgemm完成了计算:
$$ C = \mathrm{op}(A) \times \mathrm{op}(B) $$
求解C=AxB
其中 A为mA行nA列 B为mB行nB列 所以 C为mC行nC列,并且六个参数有如下关系:
$$ m_C = m_A,\quad n_C = n_B,\quad n_A = m_B $$
不使用cublasSgemm transa与transb参数
由于C/C++程序中输入的A和B是按行存储,所以在 transa 与 transb 均为 CUBLAS_OP_N 的情况下,cublas其实读取到的是A和B的转置矩阵 A^T^ 和 B^T^
根据线性代数的规则可知C^T^ = (A x B)^T^ = B^T^ x A^T^ , 所以cublasSgemm API中几个参数设置如下:
- 设置了cublasSgemm的transa与transb参数=CUBLAS_OP_N
- 乘法左矩阵为B^T^=参数设置为B,乘法右矩阵为A^T^=参数设置为A
- 结果矩阵的行数为C^T^的行数=参数设置为nC
- 结果矩阵的列数为C^T^的列数=参数设置为mC
- 乘法左矩阵列与乘法右矩阵的行=参数设置为nA或mB
- 按列优先读取乘法左矩阵B的主维度(即B^T^有几行)=参数设置为nB
- 按列优先读取乘法右矩阵A的主维度(即A^T^有几行)=参数设置为nA
- 结果矩阵存储在参数C中,它的主维度(即C^T^有几行)= 参数设置为nC
cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,nB,d_A,nA,&beta,d_C,nC);
按上面的参数调用cublasSgemm API (矩阵A按行存储在指针d_a, 矩阵B按行存储在指针d_b, 矩阵C的存储空间指针d_c) 最后从结果矩阵的存储空间d_c中按行读取到的就是C=AxB的结果,整个cublasSgemm的计算过程如下图所示:
使用cublasSgemm transa与transb参数
由于C/C++程序中输入的A和B是按行存储,所以在不设置转置参数的情况下,cublas其实读取到的是A和B的转置矩阵 A^T^ 和 B^T^
设置了cublasSgemm的transa与transb参数后可以在做矩阵运算前对读取到的AT和BT矩阵做一次转置,获得A和B
根据线性代数的规则可知C = A x B 所以cublasSgemm API中几个参数设置如下
设置了cublasSgemm的transa与transb参数=CUBLAS_OP_T,在进行矩阵运算前对读取的矩阵做一次转置
- 乘法左矩阵为A=参数设置为A,乘法右矩阵为B=参数设置为B
- 结果矩阵的行数为C的行数=参数设置为mC
- 结果矩阵的列数为C的列数=参数设置为nC
- 乘法左矩阵列与乘法右矩阵的行=参数设置为nA
- 按列优先读取乘法左矩阵A的主维度(即A^T^有几行)=参数设置为nA
- 按列优先读取乘法右矩阵B的主维度(即B^T^有几行)=参数设置为nB
- 结果矩阵存储在参数C中,它的主维度(即C有几行)= 参数设置为mC
cublasSgemm(handle,CUBLAS_OP_T,CUBLAS_OP_T, mC, nC, nA,&alpha,d_a, nA, d_b, nB,&beta, d_c, mC);
运行结果由于按行优先N行M列的顺序读取C相当于做了C的转置,得到C^T^ 。在计算中如果使用带有transa和transb参数,那么在运算结束还需要对结果矩阵进行转置,降低效率,所以在本次大作业中我们使用不带transa和transb参数的cublasSgemm语句进行运算。
代码与结果分析优化
代码
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); dim3 dimGrid(ceil((float)nC / BLOCK_SIZE) , ceil((float)mC / BLOCK_SIZE) ); gettimeofday(&t3, NULL); for (int k = 0; k < 120; k++) { memset(C0, 0, sizeof(VALUE_TYPE)*mC*nC); VALUE_TYPE *d_A, *d_B, *d_C; cudaMalloc((void**)&d_A, sizeof(VALUE_TYPE)*mA*nA); cudaMalloc((void**)&d_B, sizeof(VALUE_TYPE)*mB*nB); cudaMalloc((void**)&d_C, sizeof(VALUE_TYPE)*mC*nC); cudaMemcpy(d_A, A0, sizeof(VALUE_TYPE)*mA*nA, cudaMemcpyHostToDevice); cudaMemcpy(d_B, B0[k], sizeof(VALUE_TYPE)*mB*nB, cudaMemcpyHostToDevice); cublasHandle_t handle; cublasCreate(&handle); gettimeofday(&t1, NULL); cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,mB,d_A,mB,&beta,d_C,nC); //cublasSgemm_v2(s ,CUBLAS_OP_N,CUBLAS_OP_T,m, n, k, &al, d_A,k, d_B,n, &ve, d_C,n); gettimeofday(&t2,NULL); cublasDestroy(handle); // cudaMemcpy(A0, d_C, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToHost); cudaFree(d_A); cudaFree(d_B); // cudaFree(d_C); double time_gemm = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0; gettimeofday(&t1, NULL); RELUKernel<<<dimGrid, dimBlock>>>(d_C,nA); gettimeofday(&t2,NULL); double time_biasrelu = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0; printf("k = %d, GEMM time: %4.5f ms, Bias+ReLU time: %4.5f ms\n", k+1, time_gemm, time_biasrelu); cudaMemcpy(A0, d_C, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToHost); cudaFree(d_C); }
下面为运行结果:
总共需要16.5s来完成一次前向传播。
结果分析优化
我们可以看到,用来计算矩阵相乘和bias+relu的时间开销相比于推理时间16s是极少的,所以我们应该着眼于代码其他方面的优化,比如内存分配,显存访问。
每次循环开始,使用memset将C0设置为0,定义矩阵d_A, d_B, d_C。用cudaMalloc分配空间然后用cudaMemcpy将矩阵d_A, d_B从host拷贝到device上。接着调用函数cublasSgemm,最后释放d_A,d_B,将d_C从device复制到host,然后释放d_C。本次循环结束。
可以优化的选项:
- 每次循环最后中都会把d_C传给A0,在下次循环中访问A0传给d_A,该步操作可以简化为:在循环外声明d_A,d_C,每次循环结束将d_C直接传给d_A,计算结束后不再释放d_A,d_C。循环结束时再释放。
- 对于d_B,虽然每次计算都需要从B中加载不同矩阵,但也可以在循环外面声明。
- 在120个矩阵乘完后,在循环外将d_C矩阵传给A0,然后即可释放d_C,d_A,d_B。
优化后的代码
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <stdbool.h>
#include <cublas.h>
#include "mmio.h"
#include "mmiohighlevel.h"
#define BLOCK_SIZE 32

// CSR representation of a sparse matrix.
typedef struct {
    VALUE_TYPE *value;       // nonzero values
    int *columnindex;        // column index of each nonzero
    int *rowpointer;         // row start offsets, length rows+1
}SMatrix;

// Per-element bias (-0.3) + ReLU clamped to [0, 32], applied in place.
// `n` is the row width of C. Expects a 2D launch; thread (x, y) handles
// column x of row y.
// NOTE(review): there is no row/col bounds check, so every launched thread
// writes — safe only when both matrix dimensions are exact multiples of the
// block dims (true for 60000 x 1024 with BLOCK_SIZE 32); confirm before
// reusing with other sizes.
__global__ void RELUKernel(VALUE_TYPE *C,int n) {
    VALUE_TYPE Cvalue;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    C[row * n + col]=C[row * n + col]-0.3000;
    Cvalue=C[row * n + col];
    if (Cvalue<=0){
        C[row * n + col]=0.0;
    } else if(Cvalue>=32){
        C[row * n + col]=32.0;
    }
}

// Optimized driver: device buffers d_A/d_B/d_C are allocated once and reused
// across all 120 layers; the activation stays on the GPU between layers
// (device-to-device copy d_C -> d_A) and is only copied back once at the end.
int main(int argc, char ** argv) {
    struct timeval t1, t2, t3, t4;
    int size1 = 0;
    int size2 = 0;
    int *tc1;
    int *tc2;
    double bias = -0.3000;    // documented bias; the kernel hard-codes -0.3
    const float alpha = 1.0, beta = 0.0;
    int mA;
    int nA;
    int nnzA;
    int isSymmetricA;
    SMatrix A;
    int mB;
    int nB;
    int nnzB;
    int isSymmetricB;
    SMatrix B[120];
    int mC,nC;
    int nnzC_golden = 0;      // NOTE(review): unused below
    // load matrix data from file
    gettimeofday(&t3, NULL);
    char filename1[]="sparse-images-1024.tsv";
    mmio_info(&mA, &nA, &nnzA, &isSymmetricA, filename1);
    A.value=(VALUE_TYPE*)malloc((nnzA)*sizeof(VALUE_TYPE));
    A.columnindex=(int*)malloc((nnzA)*sizeof(int));
    A.rowpointer=(int*)malloc((mA+1)*sizeof(int));
    mmio_data(A.rowpointer, A.columnindex, A.value, filename1);
    printf("input matrix A: ( %i, %i ) nnz = %i\n", mA, nA, nnzA);
    // Densify A (CSR -> row-major dense A0), then drop the CSR arrays.
    VALUE_TYPE *A0 = (VALUE_TYPE *)malloc(mA * nA * sizeof(VALUE_TYPE));
    memset(A0, 0, sizeof(VALUE_TYPE) * mA * nA);
    for (int i = 0; i < mA; i++) {
        for (int j = A.rowpointer[i]; j < A.rowpointer[i+1]; j++) {
            A0[i * nA + A.columnindex[j]] = A.value[j];
        }
    }
    free(A.rowpointer);
    free(A.columnindex);
    free(A.value);
    // Load the 120 weight matrices and densify each into B0[k] (row-major).
    char neuronfile1[] = "neuron1024/n1024-l";
    char neuronfile2[] = ".tsv";
    char filename3[60];
    VALUE_TYPE *B0[120];
    for (int k = 0; k < 120; k++) {
        char filenum[5];
        int k1=k+1;
        snprintf(filenum,sizeof(filenum),"%d",k1);
        strcpy(filename3, neuronfile1);
        strcat(filename3, filenum);
        strcat(filename3, neuronfile2);
        mmio_info(&mB, &nB, &nnzB, &isSymmetricB, filename3);
        B[k].value=(VALUE_TYPE*)malloc((nnzB)*sizeof(VALUE_TYPE));
        B[k].columnindex=(int*)malloc((nnzB)*sizeof(int));
        B[k].rowpointer=(int*)malloc((mB+1)*sizeof(int));
        mmio_data(B[k].rowpointer, B[k].columnindex, B[k].value, filename3);
        B0[k] = (VALUE_TYPE *)malloc(mB * nB * sizeof(VALUE_TYPE));
        memset(B0[k], 0, sizeof(VALUE_TYPE) * mB * nB);
        for (int i = 0; i < mB; i++) {
            for (int j = B[k].rowpointer[i]; j < B[k].rowpointer[i+1]; j++) {
                B0[k][i * nB + B[k].columnindex[j]] = B[k].value[j];
            }
        }
        free(B[k].rowpointer);
        free(B[k].columnindex);
        free(B[k].value);
    }
    gettimeofday(&t4,NULL);
    double time_load = (t4.tv_sec - t3.tv_sec) * 1000.0 + (t4.tv_usec - t3.tv_usec) / 1000.0;
    printf("Weight matrix load time: %f ms \n",time_load);
    mC = mA;
    nC = nB;
    VALUE_TYPE *C0 =(VALUE_TYPE *)malloc((mC*nC)*sizeof(VALUE_TYPE));  // NOTE(review): never used, only freed
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(ceil((float)nC / BLOCK_SIZE) , ceil((float)mC / BLOCK_SIZE) );
    gettimeofday(&t3, NULL);
    // Allocate device buffers ONCE, outside the layer loop (the key speedup
    // over the first version), and upload the initial activations.
    VALUE_TYPE *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, sizeof(VALUE_TYPE)*mA*nA);
    cudaMalloc((void**)&d_B, sizeof(VALUE_TYPE)*mB*nB);
    cudaMalloc((void**)&d_C, sizeof(VALUE_TYPE)*mC*nC);
    cudaMemcpy(d_A, A0, sizeof(VALUE_TYPE)*mA*nA, cudaMemcpyHostToDevice);
    for (int k = 0; k < 120; k++) {
        // Only the weight matrix changes per layer; reuse d_B for each.
        cudaMemcpy(d_B, B0[k], sizeof(VALUE_TYPE)*mB*nB, cudaMemcpyHostToDevice);
        free(B0[k]);  // NOTE(review): host free inside the timed region
        // NOTE(review): handle create/destroy per iteration could be hoisted
        // out of the loop as well.
        cublasHandle_t handle;
        cublasCreate(&handle);
        gettimeofday(&t1, NULL);
        // Computes C^T = B^T * A^T with correct leading dims nB / nA / nC,
        // so reading d_C row-major yields C = A*B (see the write-up above).
        // cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,mB,d_A,mB,&beta,d_C,nC);
        cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,nC,mC,nA,&alpha,d_B,nB,d_A,nA,&beta,d_C,nC);
        // cublasSgemm(handle,CUBLAS_OP_T,CUBLAS_OP_T, mC, nC, nA,&alpha,d_a, nA, d_b, nB,&beta, d_c, mC);
        gettimeofday(&t2,NULL);
        cublasDestroy(handle);
        // NOTE(review): cublasSgemm is asynchronous; without a synchronize,
        // t1/t2 may bracket only the launch, not the computation.
        double time_gemm = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
        gettimeofday(&t1, NULL);
        // Bias + clamped ReLU in place on d_C; nA is passed as the row width
        // (equal to nC here).
        // NOTE(review): the kernel launch returns immediately, so this
        // measured time is launch overhead only — likely the source of the
        // "36000x" speedup figure quoted later; confirm with cudaEvents.
        RELUKernel<<<dimGrid, dimBlock>>>(d_C,nA);
        gettimeofday(&t2,NULL);
        double time_biasrelu = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
        printf("k = %d, GEMM time: %4.5f ms, Bias+ReLU time: %4.5f ms\n", k+1, time_gemm, time_biasrelu);
        // Activations stay on the device: d_C becomes the next layer's input.
        cudaMemcpy(d_A, d_C, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToDevice);
    }
    // Single copy back to the host after all 120 layers.
    cudaMemcpy(A0, d_A, sizeof(VALUE_TYPE)*mC*nC, cudaMemcpyDeviceToHost);
    gettimeofday(&t4,NULL);
    double time_inference = (t4.tv_sec - t3.tv_sec) * 1000.0 + (t4.tv_usec - t3.tv_usec) / 1000.0;
    printf("Inference time: %f ms \n",time_inference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(C0);
    // check results
    // Write the 1-based index of every row with a nonzero sum, then compare
    // that file byte-wise (read as ints) against the reference categories.
    printf("test\n");
    FILE* fs;
    fs=fopen("sparse-images-1024-1.tsv","w+");
    for (int i = 0; i <mA; i++) {
        int sum =0;
        for (int j = (i*nA); j < ((i+1)*nA); j++) {
            sum+=A0[j];
        }
        if(sum!=0) {
            fprintf(fs,"%d\n", i+1);
        }
    }
    fclose(fs);
    FILE* fp2=NULL;
    fp2 = fopen("sparse-images-1024-1.tsv", "rb");
    if (fp2 == NULL) {
        printf("Error:Open file fail!\n");
    }
    fseek(fp2, 0, SEEK_END);
    size2 = ftell(fp2);
    rewind(fp2);
    tc2 = (int*)malloc(sizeof(int) * size2/4);
    int readnum2 = fread(tc2, 4, size2/4, fp2);
    fclose(fp2);
    FILE* fp1;
    fp1 = fopen("neuron1024-l120-categories.tsv", "rb");
    if (fp1 == NULL) {
        printf("Error:Open file fail!\n");
    }
    fseek(fp1, 0, SEEK_END);
    size1 = ftell(fp1);
    rewind(fp1);
    tc1 = (int*)malloc(sizeof(int) * size1/4);
    int readnum1 = fread(tc1, 4, size1/4, fp1);
    fclose(fp1);
    // NOTE(review): both files are text, compared as raw 4-byte ints; this
    // only works because both are produced in the identical format.
    int judge=0;
    for(int i=0;i<size1/4;i++) {
        if(tc1[i]-tc2[i] != 0) {
            judge++;
        }
    }
    printf("judge:%d\n",judge);
    if (judge == 0) {
        printf("CHALLENGE PASSED\n");
    } else {
        printf("CHALLENGE FAILED\n");
    }
    free(A0);
    return 0;
}
该段代码实现了上一节中所述的可优化的部分,下面为运行结果截图:
6.7s!对于内存访问的优化竟然相比于原本方法快了10s,除去后面要说的relu+bias的时间也就是快了9s左右。很不可思议哎!
bias+relu的优化
这bias和relu就是对矩阵中的每一项加偏置项,然后使用relu激活,很明显可以使用kernel函数通过每个线程对其进行并行处理,以达到加速的目的。
这里使用二维网格与线程块来进行计算。行索引 row 通过 blockIdx.y * blockDim.y + threadIdx.y 得到,blockIdx.y 是线程块在 y 方向(纵向)的索引,blockDim.y 是线程块在 y 方向的线程数,threadIdx.y 是线程在块内 y 方向的索引。col 的计算相似。
kernel函数如下:
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); dim3 dimGrid(ceil((float)nC / BLOCK_SIZE) , ceil((float)mC / BLOCK_SIZE) ); __global__ void RELUKernel(VALUE_TYPE *C,int n) { VALUE_TYPE Cvalue; int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; C[row * n + col]=C[row * n + col]-0.3000; Cvalue=C[row * n + col]; if (Cvalue<=0){ C[row * n + col]=0.0;} else if(Cvalue>=32){ C[row * n + col]=32.0; } }
for循环的时间:
kernel的时间:
提升了约36000倍!
结论
最后优化的时间为 6.7s 左右。