How to quickly find a image in another image using CUDA?
In my current project I need to find pixel exact position of image contained in another image of larger size. Smaller image is never rotated or stretched (so should match pixel by pixel) but it may have different brightness and some pixels in the image may be distorted. My first attemp was to do it on CPU but it was too slow. The calculations are very parallel, so I decided to use the GPU. I just started to learn CUDA and wrote my first CUDA app. My code works but it still is too slow even on GPU. When the larger image has a dimension of 1024x1280 and smaller is 128x128 program performs calculations in 2000ms on GeForce GTX 560 ti. I need to get results in less than 200ms. In the future I'll probably need a more complex algorithm, so I'd rather have even more computational power reserve. The question is how I can optimise my code to achieve that speed up?
CUDAImageLib.dll:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cutil.h>
//#define SUPPORT_ALPHA
__global__ void ImageSearch_kernel(float* BufferOut, float* BufferB, float* BufferS, unsigned int bw, unsigned int bh, unsigned int sw, unsigned int sh)
{
unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;
float diff = 0;
for (unsigned int y = 0; y < sh; ++y)
{
for (unsigned int x = 0; x < sw; ++x)
{
unsigned int as = (x + y * sw) * 4;
unsigned int ab = (x + bx + (y + by) * bw) * 4;
#ifdef SUPPORT_ALPHA
diff += ((abs(BufferS[as] - BufferB[ab]) + abs(BufferS[as + 1] - BufferB[ab + 1]) + abs(BufferS[as + 2] - BufferB[ab + 2])) * BufferS[as + 3] * BufferB[ab + 3]);
#else
diff += abs(BufferS[as] - BufferB[ab]);
diff += abs(BufferS[as + 1] - BufferB[ab + 1]);
diff += abs(BufferS[as + 2] - BufferB[ab + 2]);
#endif
}
}
BufferOut[bx + (by * (bw - sw))] = diff;
}
extern "C" int __declspec(dllexport) __stdcall ImageSearchGPU(float* BufferOut, float* BufferB, float* BufferS, int bw, int bh, int sw, int sh)
{
int aBytes = (bw * bh) * 4 * sizeof(float);
int bBytes = (sw * sh) * 4 * sizeof(float);
int cBytes = ((bw - sw) * (bh - sh)) * sizeof(float);
dim3 threadsPerBlock(32, 32);
dim3 numBlocks((bw - sw) / threadsPerBlock.x, (bh - sh) / threadsPerBlock.y);
float *dev_B = 0;
float *dev_S = 0;
float *dev_Out = 0;
unsigned int timer = 0;
float sExecutionTime = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_Out, cBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_B, aBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_S, bBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_B, BufferB, aBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_S, BufferS, bBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cutCreateTimer(&timer);
cutStartTimer(timer);
// Launch a kernel on the GPU with one thread for each element.
ImageSearch_kernel<<<numBlocks, threadsPerBlock>>>(dev_Out, dev_B, dev_S, bw, bh, sw, sh);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cutStopTimer(timer);
sExecutionTime = cutGetTimerValue(timer);
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(BufferOut, dev_Out, cBytes, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_Out);
cudaFree(dev_B);
cudaFree(dev_S);
return (int)sExecutionTime;
}
extern "C" int __declspec(dllexport) __stdcall FindMinCPU(float* values, int count)
{
int minIndex = 0;
float minValue = 3.4e+38F;
for (int i = 0; i < count; ++i)
{
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}
}
return minIndex;
}
C# test app:
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Drawing;
namespace TestCUDAImageSearch
{
class Program
{
static void Main(string[] args)
{
using(Bitmap big = new Bitmap("Big.png"), small = new Bitmap("Small.png"))
{
Console.WriteLine("Big " + big.Width + "x" + big.Height + " Small " + small.Width + "x" + small.Height);
Stopwatch sw = new Stopwatch();
sw.Start();
Point point = CUDAImageLIb.ImageSearch(big, small);
sw.Stop();
long t = sw.ElapsedMilliseconds;
Console.WriteLine("Image found at " + point.X + "x" + point.Y);
Console.WriteLine("total time=" + t + "ms kernel time=" + CUDAImageLIb.LastKernelTime + "ms");
}
Console.WriteLine("Hit key");
Console.ReadKey();
}
}
}
//#define SUPPORT_HSB
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
namespace TestCUDAImageSearch
{
public static class CUDAImageLIb
{
[DllImport("CUDAImageLib.dll")]
private static extern int ImageSearchGPU(float[] bufferOut, float[] bufferB, float[] bufferS, int bw, int bh, int sw, int sh);
[DllImport("CUDAImageLib.dll")]
private static extern int FindMinCPU(float[] values, int count);
private static int _lastKernelTime = 0;
public static int LastKernelTime
{
get { return _lastKernelTime; }
}
public static Point ImageSearch(Bitmap big, Bitmap small)
{
int bw = big.Width;
int bh = big.Height;
int sw = small.Width;
int sh = small.Height;
int mx = (bw - sw);
int my = (bh - sh);
float[] diffs = new float[mx * my];
float[] b = ImageToFloat(big);
float[] s = ImageToFloat(small);
开发者_开发技巧 _lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
int minIndex = FindMinCPU(diffs, diffs.Length);
return new Point(minIndex % mx, minIndex / mx);
}
public static List<Point> ImageSearch(Bitmap big, Bitmap small, float maxDeviation)
{
int bw = big.Width;
int bh = big.Height;
int sw = small.Width;
int sh = small.Height;
int mx = (bw - sw);
int my = (bh - sh);
int nDiff = mx * my;
float[] diffs = new float[nDiff];
float[] b = ImageToFloat(big);
float[] s = ImageToFloat(small);
_lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
List<Point> points = new List<Point>();
for(int i = 0; i < nDiff; ++i)
{
if (diffs[i] < maxDeviation)
{
points.Add(new Point(i % mx, i / mx));
}
}
return points;
}
#if SUPPORT_HSB
private static float[] ImageToFloat(Bitmap img)
{
int w = img.Width;
int h = img.Height;
float[] pix = new float[w * h * 4];
int i = 0;
for (int y = 0; y < h; ++y)
{
for (int x = 0; x < w; ++x)
{
Color c = img.GetPixel(x, y);
pix[i] = c.GetHue() / 360;
pix[i + 1] = c.GetSaturation();
pix[i + 2] = c.GetBrightness();
pix[i + 3] = c.A;
i += 4;
}
}
return pix;
}
#else
private static float[] ImageToFloat(Bitmap bmp)
{
int w = bmp.Width;
int h = bmp.Height;
int n = w * h;
float[] pix = new float[n * 4];
System.Diagnostics.Debug.Assert(bmp.PixelFormat == PixelFormat.Format32bppArgb);
Rectangle r = new Rectangle(0, 0, w, h);
BitmapData bmpData = bmp.LockBits(r, ImageLockMode.ReadOnly, bmp.PixelFormat);
System.Diagnostics.Debug.Assert(bmpData.Stride > 0);
int[] pixels = new int[n];
System.Runtime.InteropServices.Marshal.Copy(bmpData.Scan0, pixels, 0, n);
bmp.UnlockBits(bmpData);
int j = 0;
for (int i = 0; i < n; ++i)
{
pix[j] = (pixels[i] & 255) / 255.0f;
pix[j + 1] = ((pixels[i] >> 8) & 255) / 255.0f;
pix[j + 2] = ((pixels[i] >> 16) & 255) / 255.0f;
pix[j + 3] = ((pixels[i] >> 24) & 255) / 255.0f;
j += 4;
}
return pix;
}
#endif
}
}
Looks like what you are talking about is a well known problem: Template matching. The easiest way forward is to convolve the Image (the bigger image) with the template (the smaller image). You could implement convolutions in one of two ways.
1) Modify the convolutions example from the CUDA SDK (similar to what you are doing anyway).
2) Use FFTs to implement the convolution. Ref. Convolution theorem. You will need to remember
% MATLAB format
L = size(A) + size(B) - 1;
conv2(A, B) = IFFT2(FFT2(A, L) .* FFT2(B, L));
You could use cufft to implement the 2 dimensional FFTs (After padding them appropriately). You will need to write a kernel that does element wise multiplication and then normalizes the result (because CUFFT does not normalize) before performing the inverse FFT.
For the sizes you mention, (1024 x 1280 and 128 x 128), the inputs must be padded to atleast ((1024 + 128 - 1) x (1280 + 128 -1) = 1151 x 1407). But FFTs are fastest when the (padded) inputs are powers of 2. So you will need to pad both the large and small images to size 2048 x 2048.
You could speed up your calculations by using faster memory access, for example by using
- Texture Cache for the big image
- Shared Memory or Constant Cache for the small image or parts of it.
But your real problem is the whole approach of your comparison. Comparing the images pixel by pixel at every possible location will never be efficient. There is just too much work to do. First you should think about finding ways to
- Select the interesting image regions in the big image where the small image might be contained and only search in these
- Find a faster comparison mechanism, by something representing the images that are not their pixels values. You should be able to compare the images by computing a representation with less data, e.g. a color histogram, or integral images.
精彩评论