Accelerating PyTorch inference with TensorRT
Building the engine
Environment
Ubuntu 16.04
TensorRT 7.0.0.11
CUDA 10.0
#include <cuda_runtime.h>
#include <fstream>
#include <iostream>
#include <string>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 2;
samplesCommon::Args gArgs;
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
// static Logger gLogger;  // a global Logger instance, passed into the various TensorRT calls
const std::string gSampleName = "TensorRT.sample_onnx_image";
// bool OnnxToTRTModel(const std::string& )
int main() {
Logger gLogger;
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch =
1U << static_cast<uint32_t>(
nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
// Create the network definition (the model container)
std::cout << "Creating the network definition" << std::endl;
nvinfer1::INetworkDefinition* network =
builder->createNetworkV2(explicitBatch);
// Populate the network by parsing the ONNX model
std::cout << "Parsing the ONNX model" << std::endl;
auto parser = nvonnxparser::createParser(*network, gLogger);
const char* onnx_filename = "../gpu.onnx";
parser->parseFromFile(onnx_filename,
static_cast<int>(Logger::Severity::kWARNING));
// Build the engine
std::cout << "Building the engine" << std::endl;
builder->setMaxBatchSize(1);  // maximum batch size
// builder->setMaxWorkspaceSize(1600 * (1 << 20));  // cap workspace memory at 1600 MB
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
config->setMaxWorkspaceSize(160 * (1 << 20));
config->setFlag(nvinfer1::BuilderFlag::kFP16);
std::cout << "Building engine, please wait for a while..." << std::endl;
nvinfer1::ICudaEngine* engine =
    builder->buildEngineWithConfig(*network, *config);
std::cout << "Build engine successfully!" << std::endl;
// Serialize the engine and save it to disk
std::cout << "Serializing the engine" << std::endl;
nvinfer1::IHostMemory* giemodelstream = engine->serialize();
std::ofstream serialize_output_stream;
serialize_output_stream.open("./serialize_engine_output.trt",
                             std::ios::binary | std::ios::out);
serialize_output_stream.write(
    reinterpret_cast<const char*>(giemodelstream->data()),
    giemodelstream->size());
std::cout << "writing engine file..." << std::endl;
serialize_output_stream.close();
}
Deploying the model for inference
#include <cuda_runtime.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <string>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"
using namespace nvinfer1;
#define DEVICE 0 // GPU id
#define BATCH_SIZE 1
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int CLASS_NUM = 2;
// static const int OUTPUT_SIZE =
// Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) +
// 1;
static const int OUTPUT_SIZE = 2;
void doInference(IExecutionContext& context, cudaStream_t& stream,
void** buffers, float* input, float* output, int batchSize) {
// DMA input batch data to device, infer on the batch asynchronously, and DMA
// output back to host
cudaMemcpyAsync(buffers[0], input,
batchSize * 3 * INPUT_H * INPUT_W * sizeof(float),
cudaMemcpyHostToDevice, stream);
context.enqueue(batchSize, buffers, stream, nullptr);
cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float),
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
}
cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
int w, h, x, y;
float r_w = input_w / (img.cols * 1.0);
float r_h = input_h / (img.rows * 1.0);
if (r_h > r_w) {
w = input_w;
h = r_w * img.rows;
x = 0;
y = (input_h - h) / 2;
} else {
w = r_h * img.cols;
h = input_h;
x = (input_w - w) / 2;
y = 0;
}
cv::Mat re(h, w, CV_8UC3);
cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
return out;
}
int main() {
Logger gLogger;
std::string engine_name = "serialize_engine_output.trt"; // load engine name
// start load engine
char* trtModelStream{nullptr};
size_t size{0};
std::ifstream engine_file(engine_name, std::ios::binary);
if (engine_file.good()) {
engine_file.seekg(0, engine_file.end);  // seek to the end of the stream to find the file size
size = engine_file.tellg();
engine_file.seekg(0, engine_file.beg);
trtModelStream = new char[size];
engine_file.read(trtModelStream, size);
engine_file.close();
}
static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
static float prob[BATCH_SIZE * OUTPUT_SIZE];
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
IExecutionContext* context = engine->createExecutionContext();
delete[] trtModelStream;
void* buffers[2];
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
// Using the binding indices above, set up an array of device pointers for the input and output buffers
// Create GPU buffers on device
cudaMalloc(&buffers[inputIndex],
BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float));
cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float));
// Create stream
cudaStream_t stream;
cudaStreamCreate(&stream);
// Read and letterbox the test image
cv::Mat img = cv::imread("../3.jpg");
cv::Mat pre_img = preprocess_img(img, INPUT_W, INPUT_H);
auto start = std::chrono::system_clock::now();
int i = 0;
int fcount = 0;
// Split BGR, reorder to RGB planar (HWC -> CHW) and normalize to [0, 1]
for (int row = 0; row < INPUT_H; ++row) {
uchar* uc_pixel = pre_img.data + row * pre_img.step;
for (int col = 0; col < INPUT_W; ++col) {
data[fcount * 3 * INPUT_H * INPUT_W + i] =
static_cast<float>(uc_pixel[2]) / 255.0;
data[fcount * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] =
static_cast<float>(uc_pixel[1]) / 255.0;
data[fcount * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] =
static_cast<float>(uc_pixel[0]) / 255.0;
uc_pixel += 3;
++i;
}
}
// Run inference
doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
auto end = std::chrono::system_clock::now();
std::cout << "prob: " << std::endl;
std::cout << prob[0] << " " << prob[1] << std::endl;
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
start)
.count()
<< "ms" << std::endl;
}
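The example above never releases the stream, the device buffers, or the TensorRT objects. Before main returns, a cleanup block along the following lines could be added (a sketch using the TensorRT 7 destroy() idiom; it is not part of the original code):

// Cleanup sketch: release the CUDA and TensorRT resources created above.
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
context->destroy();
engine->destroy();
runtime->destroy();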
Results
prob:
-9.97636 10.7392
8ms
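The two numbers are the raw logits for the two classes. To turn them into a class prediction, a softmax can be applied on the host; the small standalone program below is a sketch of that post-processing (my own addition, not part of the original code), using the values printed above:

#include <algorithm>
#include <cmath>
#include <iostream>

int main() {
  // Logits copied from the run above: class 0, class 1.
  float prob[2] = {-9.97636f, 10.7392f};
  float m = std::max(prob[0], prob[1]);  // subtract the max for numerical stability
  float e0 = std::exp(prob[0] - m);
  float e1 = std::exp(prob[1] - m);
  int cls = (prob[1] > prob[0]) ? 1 : 0;
  float confidence = (cls == 1 ? e1 : e0) / (e0 + e1);
  std::cout << "class " << cls << ", confidence " << confidence << std::endl;
  return 0;
}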
Pitfalls encountered
[TensorRT] ERROR: Network must have at least one output
The cause is that the PyTorch version used for the ONNX export is too new.
With PyTorch 1.4.0 the exported ONNX model triggers this parsing problem. (Reportedly the issue affects TensorRT 5.0-6.0 and does not occur with TensorRT 7.0; see the discussions on parsing ONNX files exported from torch >= 1.2 with TensorRT 6.)
Switching to TensorRT 7.0.0.11 fixed it.
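Independent of the version issue, it helps to check the parser result before building the engine. The snippet below is a sketch (not in the original code) that prints the parser errors and verifies that the network actually has an output, so this failure is reported at parse time:

// Sketch: check the ONNX parser result before building the engine.
// parseFromFile returns false on failure; the recorded errors explain why.
if (!parser->parseFromFile(onnx_filename,
                           static_cast<int>(Logger::Severity::kWARNING))) {
  for (int i = 0; i < parser->getNbErrors(); ++i) {
    std::cout << parser->getError(i)->desc() << std::endl;
  }
  return -1;
}
if (network->getNbOutputs() == 0) {
  std::cout << "Network must have at least one output" << std::endl;
  return -1;
}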
"ONNX parser only supports networks with an explicit batch dimension"
Solution: with TensorRT 7, the ONNX parser requires the network to be created with the kEXPLICIT_BATCH flag via createNetworkV2:
nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);