Buffer communication speed nightmare
I'm trying to use buffers to communicate between several 'layers' (threads) in my program and now that I have visual output of what's going on inside, I realize there's a devastating amount of time being eaten up in the process of using these buffers.
Here's some notes about what's going on in my code.
- when the rendering mode is triggered in this thread, it begins sending as many points as it can to the layer (thread) below it
- the points from the lower thread are then processed and returned to this thread via the output buffer of the lower thread
points received back are mapped (for now) as white pixels in the D3D surface
if I bypass the buffer and put the points directly into the surface pixels, it only takes about 3 seconds to do the whole job
- if I hand the point down and then have the lower layer pass it right back up, skipping any actual number-crunching, the whole job takes about 30 minutes (which makes the whole program useless)
changing the size of my buffers has no noticeable effect on the speed
I was originally using MUTEXes in my buffers but have eliminated them in attempt the fix the problem
Is there something I can do differently to fix this speed problem I'm having? ...something to do with the way I'm handling these messages???
Here's my code I'm very sorry that it's such a mess. I'm having to move way too fast on this project and I've left a lot of pieces laying around in comments where I've been experimenting.
DWORD WINAPI CONTROLSUBSYSTEM::InternalExProcedure(__in LPVOID lpSelf)
{
XMSG xmsg;
LPCONTROLSUBSYSTEM lpThis = ((LPCONTROLSUBSYSTEM)lpSelf);
BOOL bStall;
BOOL bRendering = FALSE;
UINT64 iOutstandingPoints = 0; // points that are out being tested
UINT64 iPointsDone = 0;
UINT64 iPointsTotal = 0;
BOOL bAssigning;
DOUBLE dNextX;
DOUBLE dNextY;
while(1)
{
if( lpThis->hwTargetWindow!=NULL && lpThis->d3ddev!=NULL )
{
lpThis->d3ddev->Clear(0,NULL,D3DCLEAR_TARGET,D3DCOLOR_XRGB(0,0,0),1.0f,0);
if(lpThis->d3ddev->BeginScene())
{
lpThis->d3ddev->StretchRect(lpThis->sfRenderingCanvas,NULL,lpThis->sfBackBuffer,NULL,D3DTEXF_NONE);
lpThis->d3ddev->EndScene();
}
lpThis->d3ddev->Present(NULL,NULL,NULL,NULL);
}
//bStall = TRUE;
// read input buffer
if(lpThis->bfInBuffer.PeekMessage(&xmsg))
{
bStall = FALSE;
if( HIBYTE(xmsg.wType)==HIBYTE(CONT_MSG) )
{
// take message off
lpThis->bfInBuffer.GetMessage(&xmsg);
// double check consistency
if( HIBYTE(xmsg.wType)==HIBYTE(CONT_MSG) )
{
switch(LOBYTE(xmsg.wType))
{
case SETRESOLUTION_MSG:
lpThis->iAreaWidth = (UINT)xmsg.dptPoint.X;
lpThis->iAreaHeight = (UINT)xmsg.dptPoint.Y;
lpThis->sfRenderingCanvas->Release();
if(lpThis->d3ddev->CreateOffscreenPlainSurface(
(UINT)xmsg.dptPoint.X,(UINT)xmsg.dptPoint.Y,
D3DFMT_X8R8G8B8,
D3DPOOL_DEFAULT,
&(lpThis->sfRenderingCanvas),
NULL)!=D3D_OK)
{
MessageBox(NULL,"Error resizing surface.","ERROR",MB_ICONERROR);
}
else
{
D3DLOCKED_RECT lrt;
if(D3D_OK == lpThis->sfRenderingCanvas->LockRect(&lrt,NULL,0))
{
lpThis->iPitch = lrt.Pitch;
VOID *data;
data = lrt.pBits;
ZeroMemory(data,lpThis->iPitch*lpThis->iAreaHeight);
lpThis->sfRenderingCanvas->UnlockRect();
MessageBox(NULL,"Surface Resized","yay",0);
}
else
{
MessageBox(NULL,"Error resizing surface.","ERROR",MB_ICONERROR);
}
}
break;
case SETCOLORMETHOD_MSG:
break;
case SAVESNAPSHOT_MSG:
lpThis->SaveSnapshot();
break;
case FORCERENDER_MSG:
bRendering = TRUE;
iPointsTotal = lpThis->iAreaHeight*lpThis->iPitch;
iPointsDone = 0;
MessageBox(NULL,"yay, render something!",":o",0);
break;
default:
break;
}
}// else, lost this message
}
else
{
if( HIBYTE(xmsg.wType)==HIBYTE(MATH_MSG) )
{
XMSG xmsg2;
switch(LOBYTE(xmsg.wType))
{
case RESETFRAME_MSG:
case ZOOMIN_MSG:
case ZOOMOUT_MSG:
case PANUP_MSG:
case PANDOWN_MSG:
case PANLEFT_MSG:
case PANRIGHT_MSG:
开发者_如何学运维 // tell self to start a render
xmsg2.wType = CONT_MSG|FORCERENDER_MSG;
if(lpThis->bfInBuffer.PutMessage(&xmsg2))
{
// pass it down
while(!lpThis->lplrSubordinate->PutMessage(&xmsg));
// message passed so pull it from buffer
lpThis->bfInBuffer.GetMessage(&xmsg);
}
break;
default:
// pass it down
if(lpThis->lplrSubordinate->PutMessage(&xmsg))
{
// message passed so pull it from buffer
lpThis->bfInBuffer.GetMessage(&xmsg);
}
break;
}
}
else if( lpThis->lplrSubordinate!=NULL )
// pass message down
{
if(lpThis->lplrSubordinate->PutMessage(&xmsg))
{
// message passed so pull it from buffer
lpThis->bfInBuffer.GetMessage(&xmsg);
}
}
}
}
// read output buffer from subordinate
if( lpThis->lplrSubordinate!=NULL && lpThis->lplrSubordinate->PeekMessage(&xmsg) )
{
bStall = FALSE;
if( xmsg.wType==(REPLY_MSG|TESTPOINT_MSG) )
{
// got point test back
D3DLOCKED_RECT lrt;
if(D3D_OK == lpThis->sfRenderingCanvas->LockRect(&lrt,NULL,0))
{
INT pitch = lrt.Pitch;
VOID *data;
data = lrt.pBits;
INT Y=dRound((xmsg.dptPoint.Y/(DOUBLE)100)*((DOUBLE)lpThis->iAreaHeight));
INT X=dRound((xmsg.dptPoint.X/(DOUBLE)100)*((DOUBLE)pitch));
// decide color
if( xmsg.iNum==0 )
((WORD *)data)[X+Y*pitch] = 0xFFFFFFFF;
else
((WORD *)data)[X+Y*pitch] = 0xFFFFFFFF;
// message handled so remove from buffer
lpThis->lplrSubordinate->GetMessage(&xmsg);
lpThis->sfRenderingCanvas->UnlockRect();
}
}
else if(lpThis->bfOutBuffer.PutMessage(&xmsg))
{
// message sent so pull the real one off the buffer
lpThis->lplrSubordinate->GetMessage(&xmsg);
}
}
if( bRendering && lpThis->lplrSubordinate!=NULL )
{
bAssigning = TRUE;
while(bAssigning)
{
dNextX = 100*((DOUBLE)(iPointsDone%lpThis->iPitch))/((DOUBLE)lpThis->iPitch);
dNextY = 100*(DOUBLE)((INT)(iPointsDone/lpThis->iPitch))/(DOUBLE)(lpThis->iAreaHeight);
xmsg.dptPoint.X = dNextX;
xmsg.dptPoint.Y = dNextY;
//
//xmsg.iNum = 0;
//xmsg.wType = REPLY_MSG|TESTPOINT_MSG;
//
xmsg.wType = MATH_MSG|TESTPOINT_MSG;
/*D3DLOCKED_RECT lrt;
if(D3D_OK == lpThis->sfRenderingCanvas->LockRect(&lrt,NULL,0))
{
INT pitch = lrt.Pitch;
VOID *data;
data = lrt.pBits;
INT Y=dRound((dNextY/(DOUBLE)100)*((DOUBLE)lpThis->iAreaHeight));
INT X=dRound((dNextX/(DOUBLE)100)*((DOUBLE)pitch));
((WORD *)data)[X+Y*pitch] = 0xFFFFFFFF;
lpThis->sfRenderingCanvas->UnlockRect();
}
iPointsDone++;
if( iPointsDone>=iPointsTotal )
{
MessageBox(NULL,"done rendering","",0);
bRendering = FALSE;
bAssigning = FALSE;
}
*/
if( lpThis->lplrSubordinate->PutMessage(&xmsg) )
{
bStall = FALSE;
iPointsDone++;
if( iPointsDone>=iPointsTotal )
{
MessageBox(NULL,"done rendering","",0);
bRendering = FALSE;
bAssigning = FALSE;
}
}
else
{
bAssigning = FALSE;
}
}
}
//if( bStall )
//Sleep(10);
}
return 0;
}
}
(still getting used to this forum's code block stuff)
Edit:
Here's an example that I perceive to be similar in concept, although this example consumes the messages it produces in the same thread.
#include <Windows.h>
#include "BUFFER.h"
int main()
{
BUFFER myBuffer;
INT jobsTotal = 1024*768;
INT currentJob = 0;
INT jobsOut = 0;
XMSG xmsg;
while(1)
{
if(myBuffer.PeekMessage(&xmsg))
{
// do something with message
// ...
// if successful, remove message
myBuffer.GetMessage(&xmsg);
jobsOut--;
}
while( currentJob<jobsTotal )
{
if( myBuffer.PutMessage(&xmsg) )
{
currentJob++;
jobsOut++;
}
else
{
// buffer is full at the moment
// stop for now and put more on later
break;
}
}
if( currentJob==jobsTotal && jobsOut==0 )
{
MessageBox(NULL,"done","",0);
break;
}
}
return 0;
}
This example also runs in about 3 seconds, as opposed to 30 minutes.
Btw, if anybody knows why visual studio keeps trying to make me say PeekMessageA and GetMessageA instead of the actual names I defined, that would be nice to know as well.
Locking and Unlocking an entire rect to change a single point is probably not very efficient, you might be better off generating a list of points you intend to modify and then locking the rect once, iterating over that list and modifying all the points, and then unlocking the rect.
When you lock the rect you are effectively stalling concurrent access to it, so its like a mutex for the GPU in that respect - then you only modify a single pixel. Doing this repeatedly for each pixel will constantly stall the GPU. You could use D3DLOCK_NOSYSLOCK to avoid this to some extent, but I'm not sure if it will play nicely in the larger context of your program.
I'm obviously not entirely sure what the goal of your algorithm is, but if you are trying to parallel process pixels on a d3d surface, then i think the best approach would be via a shader on the GPU.
Where you basically generate an array in system memory, populate it with "input" values on a per point/pixel basis, then generate a texture on a GPU from the array. Next you paint the texture to a full screen quad, and then render it with a pixel shader to some render target. The shader can be coded to process each point in whatever way you like, the GPU will take care of optimizing parallelization. Then you generate a new texture from that render target and then you copy that texture into a system memory array. And then you can extract all your outputs from that array. You can also apply multiple shaders to the render target result back into the render target to pipeline multiple transformations if needed.
A couple notes:
Don't write your own messape-passing code. It may be correct and slow, or fast and buggy. It takes a lot of experience to design code that's fast and then getting it bug-free is really hard, because debugging threaded code is hard. Win32 provides a couple of efficient threadsafe queues:
SList
and the window message queue.Your design splits up work in the worst possible way. Passing information between threads is expensive even under the best circumstances, because it causes cache contention, both on the data and on the synchronization objects. It's MUCH better to split your work into distinct non-interacting (or minimize interaction) datasets and give each to a separate thread, that is then responsible for all stages of processing that dataset.
Don't poll.
That's likely to be the heart of the problem. You have a task continually calling peekmessage
and probably finding nothing there. This will just eat all available CPU. Any task that wants to post messages is unlikely to receive any CPU time to acheive this.
I can't remember how you'd achieve this with the windows message queue (probably WaitMessage
or some variant) but typically you might implement this with a counting semaphore. When the consumer wants data, it waits for the semaphore to be signalled. When the producer has data, it signals the semaphore.
I managed to resolve it by redesigning the whole thing
It now passes huge payloads instead of individual tasks
(I'm the poster)
精彩评论