Using and Testing the cuDNN API: 2D Convolution + ReLU as an Example
This article was written on July 23, 2022.
1. Introduction to cuDNN
The NVIDIA CUDA Deep Neural Network library (cuDNN) is a CUDA library of deep neural network primitives for NVIDIA GPUs. cuDNN provides highly optimized implementations of the operators commonly found in neural networks, such as the forward and backward passes of convolution, pooling, normalization, and activation layers. Deep learning researchers and framework developers worldwide rely on cuDNN for high-performance GPU acceleration: it lets them focus on training networks and building applications instead of spending time on low-level GPU performance tuning. cuDNN is widely used in deep learning frameworks, including Caffe2, Chainer, Keras, MATLAB, MXNet, PyTorch, and TensorFlow.
The stock CUDA installer does not include cuDNN; it must be downloaded and installed separately. The installation procedure is beyond the scope of this article, so readers should look up the download and installation steps themselves. The example in this article is based on cuDNN 8.0, and the demo code is available at:
https://github.com/thb1314/cudnn_conv_relu
For further reading, see:
https://docs.nvidia.com/deeplearning/cudnn/
2. How to Use the cuDNN API
An application using cuDNN must first initialize a handle to the library context by calling cudnnCreate(). This handle is then passed explicitly to every library function that operates on GPU data. Once the application is done with cuDNN, it releases the resources associated with the handle by calling cudnnDestroy(). This design lets the user explicitly control the library's behavior when working with multiple host (CPU) threads, multiple GPUs, or multiple CUDA streams.
A typical cuDNN workflow looks like this:
- Create a cuDNN handle:
  cudnnStatus_t cudnnCreate(cudnnHandle_t *handle)
- From the host, call the functions that execute on the device, e.g. the convolution routine cudnnConvolutionForward.
- Destroy the cuDNN handle:
  cudnnStatus_t cudnnDestroy(cudnnHandle_t handle)

There are also a few auxiliary functions. For example, after creating the handle you can set or get the CUDA stream that cuDNN uses (see the sketch below):
cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId)
cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId)
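The following skeleton is a minimal sketch of this lifecycle (error checking is omitted for brevity; a real program should check every returned status code):

#include <cuda_runtime.h>
#include <cudnn.h>

int main() {
    // 1. Initialize the library context.
    cudnnHandle_t handle;
    cudnnCreate(&handle);

    // Optionally bind cuDNN to a user-created CUDA stream;
    // by default it runs on the legacy default stream.
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudnnSetStream(handle, stream);

    // 2. ... call device-side routines such as cudnnConvolutionForward here ...

    // 3. Release the resources tied to the handle.
    cudnnDestroy(handle);
    cudaStreamDestroy(stream);
    return 0;
}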
3. Implementing a 2D Convolution
This section shows how to implement a 2D convolution with the convolution-related cuDNN APIs. As in the cuBLAS chapter, we reuse the same tensor-management wrapper class. The steps are as follows:
- Create the input tensor, the convolution weights, and the convolution bias.
- Create the cuDNN handle and set its stream.
- Configure the shape descriptors of the tensors involved in the convolution (using the TensorDescriptor family of APIs) as well as the descriptors for the convolution configuration.
- Search for a convolution algorithm that fits the current configuration and allocate the corresponding workspace.
- Call cudnnConvolutionBiasActivationForward to perform the convolution.
- Free the workspace memory and destroy the cuDNN handle.
The core code is as follows:
int test_conv_relu() {
    // Seed the RNG so each run generates different test data
    ::srand(::time(0));
    std::cout << "CUDNN_VERSION:" << CUDNN_VERSION << std::endl;
    // Dimensions of the input and output tensors
    constexpr int batch_size = 4;
    constexpr int channel_in = 3;
    constexpr int height_in = 112;
    constexpr int width_in = 112;
    constexpr int channel_out = 15;
    constexpr int height_out = 112;
    constexpr int width_out = 112;
    constexpr int kernel_h = 1;
    constexpr int kernel_w = 1;
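    // With a 1x1 kernel, stride 1, and no padding, the output spatial size
    // equals the input size: out = (in + 2*pad - dilation*(k-1) - 1)/stride + 1.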
    // Construct the tensors
    // input
    TRT::Tensor q_tensor(std::vector<int>{batch_size, channel_in, height_in, width_in});
    // convolution kernel (weights)
    TRT::Tensor kernel_tensor(std::vector<int>{channel_out, channel_in, kernel_h, kernel_w});
    // bias
    TRT::Tensor bias_tensor(std::vector<int>{channel_out});
    // z (allocated here but not used in the computation below)
    TRT::Tensor z_tensor(std::vector<int>{batch_size, channel_out, height_out, width_out});
    // output
    TRT::Tensor out_tensor(std::vector<int>{batch_size, channel_out, height_out, width_out});
    // Fill the host-side buffers with random values in [0, 1) and save them
    // to disk so the Python script can verify the result later
    auto qptr_cpu = q_tensor.cpu<float>();
    for (int i = 0; i < q_tensor.numel(); ++i) {
        qptr_cpu[i] = float(rand() % 100000) / 100000;
    }
    q_tensor.save_to_file("q_tensor.npz");
    auto biasptr_cpu = bias_tensor.cpu<float>();
    for (int i = 0; i < bias_tensor.numel(); ++i) {
        biasptr_cpu[i] = float(rand() % 100000) / 100000;
    }
    bias_tensor.save_to_file("bias_tensor.npz");
    auto kernelptr_cpu = kernel_tensor.cpu<float>();
    for (int i = 0; i < kernel_tensor.numel(); ++i) {
        kernelptr_cpu[i] = float(rand() % 100000) / 100000;
    }
    kernel_tensor.save_to_file("kernel_tensor.npz");
    // Copy the data to the GPU and grab the device pointers
    auto qptr_gpu = q_tensor.to_gpu(true).gpu<float>();
    auto bias_gpu = bias_tensor.to_gpu(true).gpu<float>();
    auto kernel_gpu = kernel_tensor.to_gpu(true).gpu<float>();
    auto outptr_gpu = out_tensor.to_gpu().gpu<float>();
    cudaStream_t stream = out_tensor.get_stream();
    // Create the cuDNN handle and bind it to the tensor's stream
    cudnnHandle_t cudnn;
    checkCUDNN(cudnnCreate(&cudnn));
    checkCUDNN(cudnnSetStream(cudnn, stream));
    // The fused op computes: y = act( alpha1 * conv(x) + alpha2 * z + bias )
    const float alpha1 = 1;
    const float alpha2 = 0;
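    // Setting alpha2 to 0 disables the z term entirely, so no separate z
    // buffer is needed; the final call below simply passes the output
    // pointer for z.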
    // Input tensor descriptor
    cudnnTensorDescriptor_t input_descriptor;
    checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
    checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                          /*format=*/CUDNN_TENSOR_NCHW,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*batch_size=*/batch_size,
                                          /*channels=*/channel_in,
                                          /*image_height=*/height_in,
                                          /*image_width=*/width_in));
    // Output tensor descriptor
    cudnnTensorDescriptor_t output_descriptor;
    checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
    checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                          /*format=*/CUDNN_TENSOR_NCHW,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*batch_size=*/batch_size,
                                          /*channels=*/channel_out,
                                          /*image_height=*/height_out,
                                          /*image_width=*/width_out));
    // Bias descriptor
    cudnnTensorDescriptor_t bias_descriptor;
    checkCUDNN(cudnnCreateTensorDescriptor(&bias_descriptor));
    checkCUDNN(cudnnSetTensor4dDescriptor(bias_descriptor,
                                          /*format=*/CUDNN_TENSOR_NCHW,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*batch_size=*/1,
                                          /*channels=*/channel_out,
                                          /*image_height=*/1,
                                          /*image_width=*/1));
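    // The bias is described as 1 x channel_out x 1 x 1 so that cuDNN
    // broadcasts it across the batch and spatial dimensions.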
    // z tensor descriptor
    // (required by the API, although alpha2 = 0 makes the z term a no-op here)
    cudnnTensorDescriptor_t z_descriptor;
    checkCUDNN(cudnnCreateTensorDescriptor(&z_descriptor));
    checkCUDNN(cudnnSetTensor4dDescriptor(z_descriptor,
                                          /*format=*/CUDNN_TENSOR_NCHW,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*batch_size=*/batch_size,
                                          /*channels=*/channel_out,
                                          /*image_height=*/height_out,
                                          /*image_width=*/width_out));
    // Convolution weight (filter) descriptor
    cudnnFilterDescriptor_t kernel_descriptor;
    checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
    checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                          /*dataType=*/CUDNN_DATA_FLOAT,
                                          /*format=*/CUDNN_TENSOR_NCHW,
                                          /*out_channels=*/channel_out,
                                          /*in_channels=*/channel_in,
                                          /*kernel_height=*/kernel_h,
                                          /*kernel_width=*/kernel_w));
    // Convolution parameters
    cudnnConvolutionDescriptor_t convolution_descriptor;
    checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
    checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                               /*pad_height=*/0,
                                               /*pad_width=*/0,
                                               /*vertical_stride=*/1,
                                               /*horizontal_stride=*/1,
                                               /*dilation_height=*/1,
                                               /*dilation_width=*/1,
                                               /*mode=*/CUDNN_CROSS_CORRELATION,
                                               /*computeType=*/CUDNN_DATA_FLOAT));
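    // CUDNN_CROSS_CORRELATION matches what deep learning frameworks
    // (including PyTorch's F.conv2d) call "convolution"; CUDNN_CONVOLUTION
    // would additionally flip the kernel.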
    // Activation parameters
    cudnnActivationDescriptor_t activation_descriptor;
    checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
    checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
                                            /*mode=*/CUDNN_ACTIVATION_RELU,
                                            /*reluNanOpt=*/CUDNN_PROPAGATE_NAN,
                                            /*relu_coef=*/0));
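    // CUDNN_PROPAGATE_NAN lets NaN inputs pass through the activation;
    // the trailing coef argument has no effect for plain ReLU (it is the
    // clipping threshold for CUDNN_ACTIVATION_CLIPPED_RELU).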
    // Pick a convolution algorithm
    int cnt = 0;
    cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn, &cnt);
    std::cout << "cnt: " << cnt << std::endl;
    cudnnConvolutionFwdAlgoPerf_t convolution_algorithm;
    int ret_cnt = 0;
    checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnn,
                                                      input_descriptor,
                                                      kernel_descriptor,
                                                      convolution_descriptor,
                                                      output_descriptor,
                                                      /*requestedAlgoCount=*/1,
                                                      &ret_cnt,
                                                      &convolution_algorithm));
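    // The returned candidates are sorted from best to worst expected
    // performance; we request only the top one (requestedAlgoCount = 1)
    // and use its .algo field below.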
    // Query the workspace size required by the chosen algorithm and allocate it
    size_t workspace_bytes = 0;
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                       input_descriptor,
                                                       kernel_descriptor,
                                                       convolution_descriptor,
                                                       output_descriptor,
                                                       convolution_algorithm.algo,
                                                       &workspace_bytes));
    void* d_workspace{nullptr};
    cudaMalloc(&d_workspace, workspace_bytes);
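    // workspace_bytes may legitimately be 0 if the chosen algorithm needs
    // no scratch memory.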
    // Run the fused convolution + bias + activation
    checkCUDNN(cudnnConvolutionBiasActivationForward(
        cudnn, &alpha1, input_descriptor, qptr_gpu, kernel_descriptor, kernel_gpu,
        convolution_descriptor, convolution_algorithm.algo, d_workspace, workspace_bytes,
        &alpha2, z_descriptor, outptr_gpu,
        bias_descriptor, bias_gpu, activation_descriptor, output_descriptor, outptr_gpu));
    out_tensor.to_cpu(true);
    out_tensor.save_to_file("out_tensor.npz");
    // Destroy the descriptors and the handle, and free the workspace
    cudnnDestroyTensorDescriptor(input_descriptor);
    cudnnDestroyTensorDescriptor(z_descriptor);
    cudnnDestroyTensorDescriptor(output_descriptor);
    cudnnDestroyTensorDescriptor(bias_descriptor);
    cudnnDestroyFilterDescriptor(kernel_descriptor);
    cudnnDestroyConvolutionDescriptor(convolution_descriptor);
    cudnnDestroyActivationDescriptor(activation_descriptor);
    cudnnDestroy(cudnn);
    cudaFree(d_workspace);
    return 0;
}
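One caveat on algorithm selection: cudnnGetConvolutionForwardAlgorithm_v7 ranks the algorithms purely by heuristics. When startup time is not a concern, cudnnFindConvolutionForwardAlgorithm can be used instead; it actually executes the candidate algorithms and returns measured timings.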
4. Verification
For verification we can use the PyTorch API to check whether the cuDNN implementation matches torch's results on the CPU. Data is exchanged mainly through the saved NumPy files. The Python code is as follows:
import numpy as np
import torch.nn.functional as F
import torch

def _load_tensor(file):
    with open(file, "rb") as f:
        binary_data = f.read()
    magic_number, ndims, dtype = np.frombuffer(binary_data, np.uint32, count=3, offset=0)
    assert magic_number == 0xFCCFE2E2, f"{file} not a tensor file."
    dims = np.frombuffer(binary_data, np.uint32, count=ndims, offset=3 * 4)
    if dtype == 0:
        np_dtype = np.float32
    elif dtype == 1:
        np_dtype = np.float16
    else:
        assert False, f"Unsupported dtype = {dtype}, can not convert to numpy dtype"
    return np.frombuffer(binary_data, np_dtype, offset=(ndims + 3) * 4).reshape(*dims)
def load_tensor(file):
    if file.endswith("npz"):
        return np.load(file)['data']
    elif file.endswith("npy"):
        return np.load(file)
    else:
        return _load_tensor(file)
def test():
    input_tensor = load_tensor('q_tensor.npz')
    weight_tensor = load_tensor('kernel_tensor.npz')
    bias_tensor = load_tensor('bias_tensor.npz')
    out_tensor = load_tensor('out_tensor.npz')
    input_tensor = torch.as_tensor(input_tensor).float()
    weight_tensor = torch.as_tensor(weight_tensor).float()
    bias_tensor = torch.as_tensor(bias_tensor).float()
    out_tensor = torch.as_tensor(out_tensor).float()
    print(input_tensor.shape)
    print(weight_tensor.shape)
    print(bias_tensor.shape)
    print(out_tensor.shape)
    # The key call: F.conv2d followed by F.relu mirrors the fused cuDNN op
    out_tensor_torch = F.conv2d(input_tensor, weight_tensor, bias_tensor, stride=1)
    out_tensor_torch = F.relu(out_tensor_torch)
    print(torch.abs(out_tensor_torch - out_tensor).max())

if __name__ == "__main__":
    test()
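If the two implementations agree, the printed maximum absolute difference should sit at FP32 round-off level (typically on the order of 1e-6 or below); the GPU and CPU results rarely match bit-for-bit because the floating-point accumulation order differs.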
5. Summary
This article used a 2D convolution as an example to introduce how to use and verify the cuDNN API. cuDNN is a great convenience for CUDA developers: when writing a CUDA operator, experienced developers often first check whether cuDNN already implements it, and then compare their own implementation against cuDNN's to see which performs better. Familiarity with cuDNN is therefore an essential part of day-to-day CUDA development. The official examples are fairly rich, and when you are unsure how to use a particular API, searching for it on GitHub will usually turn up working usage examples.