// 123
int test_conv_relu() {
::srand(::time(0));
std::cout << "CUDNN_VERSION:" << CUDNN_VERSION << std::endl;
// 设定输入输出tensor的维度参数
constexpr int batch_size = 4;
constexpr int channel_in = 3;
constexpr int height_in = 112;
constexpr int width_in = 112;
constexpr int channel_out = 15;
constexpr int height_out = 112;
constexpr int width_out = 112;
constexpr int kernel_h = 1;
constexpr int kernel_w = 1;
// 构造相关Tensor
// input
TRT::Tensor q_tensor(std::vector<int>{batch_size, channel_in, height_in, width_in});
// kernel input
TRT::Tensor kernel_tensor(std::vector<int>{channel_out, channel_in, kernel_h, kernel_w});
// bias
TRT::Tensor bias_tensor(std::vector<int>{channel_out});
TRT::Tensor z_tensor(std::vector<int>{batch_size, channel_out, height_out, width_out});
// output
TRT::Tensor out_tensor(std::vector<int>{batch_size, channel_out, height_out, width_out});
auto qptr_cpu = q_tensor.cpu<float>();
for(int i = 0; i < q_tensor.numel(); ++i)
{
qptr_cpu[i] = float(rand() % 100000) / 100000;
}
q_tensor.save_to_file("q_tensor.npz");
auto biasptr_cpu = bias_tensor.cpu<float>();
for(int i = 0; i < bias_tensor.numel(); ++i)
{
biasptr_cpu[i] = float(rand() % 100000) / 100000;
}
bias_tensor.save_to_file("bias_tensor.npz");
auto kernelptr_cpu = kernel_tensor.cpu<float>();
for(int i = 0; i < kernel_tensor.numel(); ++i)
{
kernelptr_cpu[i] = float(rand() % 100000) / 100000;
}
kernel_tensor.save_to_file("kernel_tensor.npz");
auto qptr_gpu = q_tensor.to_gpu(true).gpu<float>();
auto bias_gpu = bias_tensor.to_gpu(true).gpu<float>();
auto kernel_gpu = kernel_tensor.to_gpu(true).gpu<float>();
auto outptr_gpu = out_tensor.to_gpu().gpu<float>();
cudaStream_t stream = out_tensor.get_stream();
// 创建cudnn句柄并设置handle的stream
cudnnHandle_t cudnn;
checkCUDNN(cudnnCreate(&cudnn));
checkCUDNN(cudnnSetStream(cudnn, stream));
// y = act ( alpha1 * conv(x) + alpha2 * z + bias )
const float alpha1 = 1;
const float alpha2 = 0;
// 设置输入Tensor描述符
cudnnTensorDescriptor_t input_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/batch_size,
/*channels=*/channel_in,
/*image_height=*/height_in,
/*image_width=*/width_in));
// 设置输出Tensor描述符
cudnnTensorDescriptor_t output_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/batch_size,
/*channels=*/channel_out,
/*image_height=*/height_out,
/*image_width=*/width_out));
// 设置bias描述符
cudnnTensorDescriptor_t bias_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&bias_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(bias_descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/channel_out,
/*image_height=*/1,
/*image_width=*/1));
// 设置z描述符
// // y = act ( alpha1 * conv(x) + alpha2 * z + bias ) 这里用不到
cudnnTensorDescriptor_t z_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&z_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(z_descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/batch_size,
/*channels=*/channel_out,
/*image_height=*/height_out,
/*image_width=*/width_out));
// 设置conv weight的描述
cudnnFilterDescriptor_t kernel_descriptor;
checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
/*dataType=*/CUDNN_DATA_FLOAT,
/*format=*/CUDNN_TENSOR_NCHW,
/*out_channels=*/channel_out,
/*in_channels=*/channel_in,
/*kernel_height=*/kernel_h,
/*kernel_width=*/kernel_w));
// 设置卷积相关参数
cudnnConvolutionDescriptor_t convolution_descriptor;
checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
/*pad_height=*/0,
/*pad_width=*/0,
/*vertical_stride=*/1,
/*horizontal_stride=*/1,
/*dilation_height=*/1,
/*dilation_width=*/1,
/*mode=*/CUDNN_CROSS_CORRELATION,
/*computeType=*/CUDNN_DATA_FLOAT));
// 设置激活层相关参数
cudnnActivationDescriptor_t activation_descriptor;
checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
/*mode=*/CUDNN_ACTIVATION_RELU,
/*reluNanOpt=*/CUDNN_PROPAGATE_NAN,
/*relu_coef=*/0));
// 获取卷积计算算法相关参数和workspace
int cnt = 0;
cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn, &cnt);
std::cout << "cnt: " << cnt << std::endl;
cudnnConvolutionFwdAlgoPerf_t convolution_algorithm;
int ret_cnt = 0;
checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnn,
input_descriptor,
kernel_descriptor,
convolution_descriptor,
output_descriptor,
1,
&ret_cnt,
&convolution_algorithm));
size_t workspace_bytes = 0;
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
input_descriptor,
kernel_descriptor,
convolution_descriptor,
output_descriptor,
convolution_algorithm.algo,
&workspace_bytes));
void* d_workspace{nullptr};
cudaMalloc(&d_workspace, workspace_bytes);
// 执行卷积运算
checkCUDNN(cudnnConvolutionBiasActivationForward(
cudnn, &alpha1, input_descriptor, qptr_gpu, kernel_descriptor, kernel_gpu,
convolution_descriptor, convolution_algorithm.algo, d_workspace, workspace_bytes,
&alpha2, z_descriptor, outptr_gpu,
bias_descriptor, bias_gpu, activation_descriptor, output_descriptor, outptr_gpu));
out_tensor.to_cpu(true);
out_tensor.save_to_file("out_tensor.npz");
// 销毁描述符和句柄
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyTensorDescriptor(z_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroyTensorDescriptor(bias_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroyConvolutionDescriptor(convolution_descriptor);
cudnnDestroy(cudnn);
cudaFree(d_workspace);
return 0;
}