使用GPU处理图像的速度比使用CPU处理图像的速度要快很多,下面是使用GPU与使用CPU做图像处理速度的对比,对同一个图像做简单的黑色的混色。
可以看出,使用GPU处理大约耗时10毫秒,而使用CPU处理大约耗时50毫秒,CUDA处理图像的速度大约是CPU处理的5倍。
这里使用的是一维纹理内存,创建并绑定纹理内存的步骤如下:
// File-scope 1D texture reference whose texels are unsigned char and are read back unconverted.
texture<unsigned char, cudaTextureType1D, cudaReadModeElementType> rT;
// Ask the runtime for the textureReference that corresponds to rT.
textureReference *texRefPtr = nullptr;
cudaGetTextureReference((const textureReference**)&texRefPtr, &rT);
// Channel format matching the unsigned char texel type.
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
// Bind `size` bytes of device memory starting at pDevSrc to rT; no offset output is requested.
cudaBindTexture(0, &rT, pDevSrc, &channelDesc, size);
其中,pDevSrc为设备内存指针,size为内存的大小。
完整代码如下:
界面显示相关,CUDAMainWindow.h
#ifndef CUDA_MAINWINDOW_H
#define CUDA_MAINWINDOW_H
#include "UIBase/UIBaseWindow.h"
#include <QWidget>
#include <QLabel>
#include <QSlider>
// Demo window that shows the same picture darkened once through CUDA and once
// on the CPU, together with the time each path took. A slider selects the
// blend factor.
class CUDAMainWindow : public UIBaseWindow
{
Q_OBJECT
public:
CUDAMainWindow(QWidget* parent = nullptr);
~CUDAMainWindow();
private:
// Preview and timing label for the CUDA result.
QLabel *m_LeftImage = nullptr;
QLabel *m_LeftTag = nullptr;
// Preview and timing label for the CPU result.
QLabel *m_RightImage = nullptr;
QLabel *m_RightTag = nullptr;
// Slider whose value (0..100) is mapped to the blend factor.
QSlider *m_Slider = nullptr;
// Unscaled source picture that both blend paths read from.
QImage m_MainImage;
// Output byte buffers for the CUDA and CPU paths; allocated with new[] on first use.
unsigned char* pCUDAImageData = nullptr;
unsigned char* pCPUImageData = nullptr;
// Multiplies every byte of m_MainImage by alpha using CUDA and returns the result.
QImage cudaBlendImage(float alpha);
// Multiplies every byte of m_MainImage by alpha on the CPU and returns the result.
QImage normalBlendImage(float alpha);
private slots:
// Re-runs both blend paths whenever the slider value changes.
void onSliderValueChanged(int);
};
#endif
界面显示相关,CUDAMainWindow.cpp
#include "CUDAMainWindow.h"
#include <QVBoxLayout>
#include <QHBoxLayout>
#include <QTime>
#include <QDebug>
#include "CUDACore/CUDAImageDispose.cuh"
// Builds the comparison UI: two image previews on top, two timing labels below
// them, and a 0..100 slider at the bottom that drives onSliderValueChanged().
// The source picture is loaded from "./test.jpg" relative to the working directory.
CUDAMainWindow::CUDAMainWindow(QWidget* parent)
    : UIBaseWindow(parent)
{
    QVBoxLayout *mainLayout = new QVBoxLayout(this);
    mainLayout->addSpacing(30);

    // Image Display
    QHBoxLayout *topLayout = new QHBoxLayout;
    m_LeftImage = new QLabel;
    m_RightImage = new QLabel;

    QImage image("./test.jpg");
    m_MainImage = image;
    if (image.isNull()) {
        // A failed load has width 0; computing the scale factor from it would
        // divide by zero, so the previews are simply left empty.
        qWarning() << "CUDAMainWindow: failed to load ./test.jpg";
    } else {
        // Shrink (or grow) the preview to roughly 400 pixels wide, keeping the aspect ratio.
        qreal scaleValue = image.width() * 1.0 / 400;
        image = image.scaled(image.width() * 1.0 / scaleValue, image.height() * 1.0 / scaleValue);
        m_LeftImage->setPixmap(QPixmap::fromImage(image));
        m_RightImage->setPixmap(QPixmap::fromImage(image));
    }
    topLayout->addWidget(m_LeftImage);
    topLayout->addWidget(m_RightImage);

    // Speed Display
    m_LeftTag = new QLabel("CUDA Delay: ");
    m_RightTag = new QLabel("CPU Delay: ");
    QHBoxLayout *midLayout = new QHBoxLayout;
    midLayout->addWidget(m_LeftTag);
    midLayout->addWidget(m_RightTag);

    // Slider Display
    m_Slider = new QSlider(Qt::Horizontal);
    m_Slider->setMaximum(100);
    m_Slider->setMinimum(0);
    m_Slider->setValue(100);
    QObject::connect(m_Slider, SIGNAL(valueChanged(int)), this, SLOT(onSliderValueChanged(int)));

    mainLayout->addLayout(topLayout);
    mainLayout->addLayout(midLayout);
    mainLayout->addStretch();
    mainLayout->addWidget(m_Slider);
}
// Releases the two output byte buffers that the blend functions allocate with
// new[] on first use. delete[] on a null pointer is a no-op, so no check is needed.
CUDAMainWindow::~CUDAMainWindow()
{
    delete[] pCUDAImageData;
    pCUDAImageData = nullptr;
    delete[] pCPUImageData;
    pCPUImageData = nullptr;
}
// Darkens m_MainImage through CUDA: every byte of the picture is multiplied by
// alpha via blendCudaImage(). The elapsed time is written to the left timing
// label. The returned QImage wraps pCUDAImageData directly (no copy), so it is
// only valid while that buffer is alive and unchanged.
QImage CUDAMainWindow::cudaBlendImage(float alpha)
{
    const int totalBytes = m_MainImage.byteCount();

    // The output buffer is created once and reused on later calls.
    if (!pCUDAImageData)
        pCUDAImageData = new unsigned char[totalBytes];

    QTime stopwatch;
    stopwatch.start();
    unsigned char *sourceBytes = (unsigned char*)m_MainImage.constBits();
    blendCudaImage(pCUDAImageData, sourceBytes, totalBytes, alpha);

    // Report the measured time
    const int elapsedMs = stopwatch.elapsed();
    m_LeftTag->setText(QString("CUDA Delay: %1 ms").arg(elapsedMs));

    return QImage(pCUDAImageData, m_MainImage.width(), m_MainImage.height(), m_MainImage.format());
}
// Darkens m_MainImage on the CPU: every byte of the picture is multiplied by
// alpha. The elapsed time is written to the right timing label. The returned
// QImage wraps pCPUImageData directly (no copy), so it is only valid while
// that buffer is alive and unchanged.
QImage CUDAMainWindow::normalBlendImage(float alpha)
{
    const int byteCount = m_MainImage.byteCount();

    // The output buffer is created once and reused on later calls.
    if (pCPUImageData == nullptr)
        pCPUImageData = new unsigned char[byteCount];

    QTime time;
    time.start();
    // Fetch the source pointer and the length once instead of calling
    // constBits()/byteCount() again for every byte of the picture.
    const unsigned char *src = m_MainImage.constBits();
    for (int i = 0; i < byteCount; ++i)
        pCPUImageData[i] = static_cast<unsigned char>(src[i] * alpha);

    // Report the measured time
    QString str = "CPU Delay: %1 ms";
    str = str.arg(time.elapsed());
    m_RightTag->setText(str);

    QImage::Format format = m_MainImage.format();
    QImage image(pCPUImageData, m_MainImage.width(), m_MainImage.height(), format);
    return image;
}
void CUDAMainWindow::onSliderValueChanged(int value)
{
qreal alpha = value * 1.0 / 100;
QImage image1 = cudaBlendImage(alpha);
qreal scaleValue = image1.width() * 1.0 / 400;
image1 = image1.scaled(image1.width() * 1.0 / scaleValue, image1.height() * 1.0 / scaleValue);
m_LeftImage->setPixmap(QPixmap::fromImage(image1));
QImage image2 = normalBlendImage(alpha);
scaleValue = image2.width() * 1.0 / 400;
image2 = image1.scaled(image2.width() * 1.0 / scaleValue, image2.height() * 1.0 / scaleValue);
m_RightImage->setPixmap(QPixmap::fromImage(image2));
}
CUDA处理,CUDAImageDispose.cuh
#ifndef CUDA_IMAGE_DISPOSE_H
#define CUDA_IMAGE_DISPOSE_H
#include "cuda_runtime.h"
#include "stdio.h"
#include "device_launch_parameters.h"
// Host entry point with C linkage: writes pSrc[i] * alpha into pDest[i] for
// each of the `size` bytes, doing the multiplication on the GPU. Both pointers
// are host memory; the buffers must hold at least `size` bytes.
extern "C" void blendCudaImage(unsigned char* pDest, unsigned char* pSrc, int size, float alpha);
#endif
CUDA处理,CUDAImageDispose.cu
#include "CUDACore/CUDAImageDispose.cuh"
// Number of threads per block used when launching blendTexture.
#define BLOCK_DIM 512
// File-scope 1D texture reference over unsigned char texels, read back unconverted.
// It is bound to the device copy of the source bytes before each kernel launch.
texture<unsigned char, cudaTextureType1D, cudaReadModeElementType> rT;
// Kernel: one thread per byte. Thread i fetches byte i through the texture
// reference rT, multiplies it by alpha and stores it in pDest[i].
// Expects a 1D grid of 1D blocks providing at least `size` threads, with rT
// already bound to the source bytes; surplus threads do nothing.
__global__ void blendTexture(unsigned char* pDest, int size, float alpha)
{
    const int byteIndex = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data have no work.
    if (byteIndex >= size)
        return;

    const unsigned char sourceByte = tex1Dfetch(rT, byteIndex);
    pDest[byteIndex] = sourceByte * alpha;
}
// Logs a failed CUDA runtime call to stderr; returns true only for cudaSuccess.
static bool cudaCallSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "blendTextureFunction: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Multiplies each of the `size` bytes at pSrc by alpha on the GPU and writes
// the results to pDest. pSrc and pDest are host pointers.
//
// Steps: copy the source to device memory, bind it to the texture reference
// rT, run blendTexture with one thread per byte, copy the result back.
// Null pointers or a non-positive size are ignored. If any CUDA call fails the
// error is logged, the device resources are released, and pDest is left
// without a valid result.
void blendTextureFunction(unsigned char* pDest, unsigned char* pSrc, int size, float alpha)
{
    if (pDest == nullptr || pSrc == nullptr || size <= 0)
        return;

    const size_t byteCount = static_cast<size_t>(size);
    unsigned char *pDevSrc = nullptr;
    unsigned char *pDevDest = nullptr;
    bool textureBound = false;

    do {
        if (!cudaCallSucceeded(cudaMalloc(&pDevSrc, byteCount), "cudaMalloc(source)"))
            break;
        if (!cudaCallSucceeded(cudaMemcpy(pDevSrc, pSrc, byteCount, cudaMemcpyHostToDevice),
                               "cudaMemcpy(host to device)"))
            break;
        if (!cudaCallSucceeded(cudaMalloc(&pDevDest, byteCount), "cudaMalloc(destination)"))
            break;

        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
        if (!cudaCallSucceeded(cudaBindTexture(0, &rT, pDevSrc, &channelDesc, byteCount),
                               "cudaBindTexture"))
            break;
        textureBound = true;

        // Integer ceiling division: going through float loses precision above
        // 2^24 bytes and could launch too few blocks to cover the last bytes.
        const unsigned int blockCount =
            static_cast<unsigned int>((byteCount + BLOCK_DIM - 1) / BLOCK_DIM);
        blendTexture<<<blockCount, BLOCK_DIM>>>(pDevDest, size, alpha);

        // Launch-configuration errors only show up through cudaGetLastError().
        if (!cudaCallSucceeded(cudaGetLastError(), "blendTexture launch"))
            break;
        // Errors raised while the kernel runs are reported by the next synchronizing call.
        if (!cudaCallSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
            break;

        cudaCallSucceeded(cudaMemcpy(pDest, pDevDest, byteCount, cudaMemcpyDeviceToHost),
                          "cudaMemcpy(device to host)");
    } while (false);

    // Cleanup runs on both the success and the failure path; cudaFree accepts null.
    if (textureBound)
        cudaUnbindTexture(rT);
    cudaFree(pDevSrc);
    cudaFree(pDevDest);
}
// Public entry point of this file: forwards all arguments unchanged to
// blendTextureFunction(), which performs the blend on the GPU.
void blendCudaImage(unsigned char* pDest, unsigned char* pSrc, int size, float alpha)
{
blendTextureFunction(pDest, pSrc, size, alpha);
}