Why is pointer access slower than vector::iterator access? (compiler code generation)
OK, the question title is a bit crappy, but I didn't really know how to phrase this better.
The problem I have is that given a std::vector<T> vs. a T* + size_t count my compiler (Visual Studio 2005 / VC++ 8) will actually generate worse code when looping over the pointer than when looping over the vector.
That is, I have a test struct containing a vector and another one containing a pointer + count. Now, when writing the semantically exact same looping construct, the version with the std::vector is significantly (which is to say > 10%) faster than the version with the pointer.
Below you will find the code as well as the generated assembly. It would be great if someone could explain what's going on here.
If you look at the assembly, you can note how the raw pointer version generates slightly more instructions. It would already be a very nice answer if anyone could explain how these versions differ semantically on the assembly level.
And please refrain from answers telling me I shouldn't care, premature optimization, root of all evil, etc. In this specific case I do care and anyway I think it is a rather interesting puzzle! :-)
Compiler settings:
- Full Optimization (/Ox)
- Whole Program Opt. = NO
Here comes the code:
stdafx.h
// Disable secure STL stuff!
#define _SECURE_SCL 0
#define _SECURE_SCL_THROWS 0
#include <iostream>
#include <iomanip>
#include <vector>
#include <mmsystem.h>
header file
// loop1.h
typedef int PodType;
const size_t container_size = 3;
extern volatile size_t g_read_size;
void side_effect();
struct RawX {
    PodType* pData;
    PodType wCount;
    RawX()
    : pData(NULL)
    , wCount(0)
    { }
    ~RawX() {
        delete[] pData;
        pData = NULL;
        wCount = 0;
    }
    void Resize(PodType n) {
        delete[] pData;
        wCount = n;
        pData = new PodType[wCount];
    }
private:
    RawX(RawX const&);
    RawX& operator=(RawX const&);
};
struct VecX {
    std::vector<PodType> vData;
};
void raw_loop(const int n, RawX* obj);
void raw_iterator_loop(const int n, RawX* obj);
void vector_loop(const int n, VecX* obj);
void vector_iterator_loop(const int n, VecX* obj);
implementation file
// loop1.cpp
void raw_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(int j=0, e=obj->wCount; j!=e; ++j) {
            g_read_size = obj->pData[j];
            side_effect();
        }
        side_effect();
    }
}
void raw_iterator_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();
    }
}
void vector_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(size_t j=0, e=obj->vData.size(); j!=e; ++j) {
            g_read_size = obj->vData[j];
            side_effect();
        }
        side_effect();
    }
}
void vector_iterator_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();      
    }
}
test main file
using namespace std;
volatile size_t g_read_size;
void side_effect()
{
    g_read_size = 0;
}
typedef size_t Value;
template<typename Container>
Value average(Container const& c)
{
    const Value sz = c.size();
    Value sum = 0;
    for(Container::const_iterator i=c.begin(), e=c.end(); i!=e; ++i)
        sum += *i;
    return sum开发者_C百科/sz;
}
void take_timings()
{
    const int x = 10;
    const int n = 10*1000*1000;
    VecX vobj;
    vobj.vData.resize(container_size);
    RawX robj;
    robj.Resize(container_size);
    std::vector<DWORD> raw_times;
    std::vector<DWORD> vec_times;
    std::vector<DWORD> rit_times;
    std::vector<DWORD> vit_times;
    for(int i=0; i!=x; ++i) {
        const DWORD t1 = timeGetTime();
        raw_loop(n, &robj);
        const DWORD t2 = timeGetTime();
        vector_loop(n, &vobj);
        const DWORD t3 = timeGetTime();
        raw_iterator_loop(n, &robj);
        const DWORD t4 = timeGetTime();
        vector_iterator_loop(n, &vobj);
        const DWORD t5 = timeGetTime();
        raw_times.push_back(t2-t1);
        vec_times.push_back(t3-t2);
        rit_times.push_back(t4-t3);
        vit_times.push_back(t5-t4);
    }
    cout << "Average over " << x << " iterations for loops with count " << n << " ...\n";
    cout << "The PodType is '" << typeid(PodType).name() << "'\n";
    cout << "raw_loop: " << setw(10) << average(raw_times) << " ms \n";
    cout << "vec_loop: " << setw(10) << average(vec_times) << " ms \n";
    cout << "rit_loop: " << setw(10) << average(rit_times) << " ms \n";
    cout << "vit_loop: " << setw(10) << average(vit_times) << " ms \n";
}
int main()
{
    take_timings();
    return 0;
}
Here comes the generated assembly as displayed by the visual studio debugger (for the 2 functions with the "iterators".
*raw_iterator_loop*
void raw_iterator_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
00  mov         eax,dword ptr [esp+4] 
00  test        eax,eax 
00  je          raw_iterator_loop+53h (4028C3h) 
00  push        ebx  
00  mov         ebx,dword ptr [esp+0Ch] 
00  push        ebp  
00  push        esi  
00  push        edi  
00  mov         ebp,eax 
        side_effect();
00  call        side_effect (401020h) 
        for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) {
00  movzx       eax,word ptr [ebx+4] 
00  mov         esi,dword ptr [ebx] 
00  lea         edi,[esi+eax*2] 
00  cmp         esi,edi 
00  je          raw_iterator_loop+45h (4028B5h) 
00  jmp         raw_iterator_loop+30h (4028A0h) 
00  lea         esp,[esp] 
00  lea         ecx,[ecx] 
            g_read_size = *j;
00  movzx       ecx,word ptr [esi] 
00  mov         dword ptr [g_read_size (4060B0h)],ecx 
            side_effect();
00  call        side_effect (401020h) 
00  add         esi,2 
00  cmp         esi,edi 
00  jne         raw_iterator_loop+30h (4028A0h) 
        }
        side_effect();
00  call        side_effect (401020h) 
00  sub         ebp,1 
00  jne         raw_iterator_loop+12h (402882h) 
00  pop         edi  
00  pop         esi  
00  pop         ebp  
00  pop         ebx  
    }
}
00  ret              
*vector_iterator_loop*
void vector_iterator_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
00  mov         eax,dword ptr [esp+4] 
00  test        eax,eax 
00  je          vector_iterator_loop+43h (402813h) 
00  push        ebx  
00  mov         ebx,dword ptr [esp+0Ch] 
00  push        ebp  
00  push        esi  
00  push        edi  
00  mov         ebp,eax 
        side_effect();
00  call        side_effect (401020h) 
        for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) {
00  mov         esi,dword ptr [ebx+4] 
00  mov         edi,dword ptr [ebx+8] 
00  cmp         esi,edi 
00  je          vector_iterator_loop+35h (402805h) 
            g_read_size = *j;
00  movzx       eax,word ptr [esi] 
00  mov         dword ptr [g_read_size (4060B0h)],eax 
            side_effect();
00  call        side_effect (401020h) 
00  add         esi,2 
00  cmp         esi,edi 
00  jne         vector_iterator_loop+21h (4027F1h) 
        }
        side_effect();      
00  call        side_effect (401020h) 
00  sub         ebp,1 
00  jne         vector_iterator_loop+12h (4027E2h) 
00  pop         edi  
00  pop         esi  
00  pop         ebp  
00  pop         ebx  
    }
}
00  ret          
While my version of the generated machine code is different from yours (MSVC++ 2005), one difference between the two variants is pretty much the same as in your code:
- In vector version of the code the "end iterator" value is pre-calculated and stored as a member of - std::vectorobject, so the inner loop simply loads the readily available value.
- In raw pointer version the "end iterator" value is calculated explicitly in the header of the inner cycle (by a - leainstruction used to implement multiplication), meaning that each iteration of the outer cycle performs that calculation again and again.
If you re-implement your raw_iterator_loop as follows (i.e. pull the calculation of the end pointer out of the outer loop)
void raw_iterator_loop(const int n, RawX* obj)
{
    PodType *e = obj->pData+size_t(obj->wCount);
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(PodType *j=obj->pData; j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();
    }
}
(or even store and maintain the end pointer in your class) you should end up with a more "fair" comparison.
One concrete reason for the difference in instructions generated is that Visual C++ vector has members _Myfirst and _Mylast (corresponding to begin() and end()) that simplify the loop setup.  
In the raw case, the compiler has to do actual pointer math to set up the required start and end locals.
It's feasible that this complicates register usage enough to make the vector code faster.
Try changing PodType from int to size_t. The conversion there complicates the loop initialization code.
The compiler has no way to know that side_effect() doesn't change obj->pData.
Storing things that cannot change in local variables can speed up tight loops, especially when functions that the compiler cannot analyze are called inside the loop.
p.S.: this affects raw_loop and vector_loop. raw_loop can be "fixed" by using local variables, vector_loop can't. It can't, because there will be an array-pointer inside the vector, and the compiler also has no way of knowing that nothing will change the array-pointer inside the vector. After all, side_effect could call e.g. push_back(). Of course, if the compiler can inline any of the loop-functions, it could possibly optimize better. E.g. if the used vector is a local variable who's address isn't known outside the function, and only passed to the loop functions. Then the compiler could again know that side_effect cannot change the vector (because it cannot know it's address), and optimize better. -> Make sure the compiler cannot inline the function if you want to optimize it for non-inlining cases.
Your timings might reflect the fact that the initial raw_loop pays the cost for loading the cache. Do you get similar timings if you reorder the tests to do the vector tests first (Or you could throw out the first iteration, or make each test a separate program).
Looking at the assembly generated for the inner loops, they are essentially identical except for one register change. I wouldn't expect any timing difference on that basis.
            g_read_size = *j; 
00  movzx       ecx,word ptr [esi]  
00  mov         dword ptr [g_read_size (4060B0h)],ecx  
            side_effect(); 
00  call        side_effect (401020h)  
00  add         esi,2  
00  cmp         esi,edi  
00  jne         raw_iterator_loop+30h (4028A0h)  
            g_read_size = *j;  
00  movzx       eax,word ptr [esi]   
00  mov         dword ptr [g_read_size (4060B0h)],eax   
            side_effect();  
00  call        side_effect (401020h)   
00  add         esi,2   
00  cmp         esi,edi   
00  jne         vector_iterator_loop+21h (4027F1h)   
I had a similar question on code timing where I couldn't explain a difference in two pieces of code. I never got a definitive answer on that one, although in the end I convinced myself that it was a case of code alignment. How can adding code to a loop make it faster?
I would have expected the optimizer to be smart enough to remove any differences, but typically a decrement-and-compare-to-zero is faster than a comparison to a non-zero pointer.
e.g.
void countdown_loop(int n, RawX* obj)
{
    if (!n) return;
    size_t const wc = obj->wCount;
    if (!wc) return;
    PodType* const first = obj->pData;
    do {
        side_effect();
        size_t i = wc;
        PodType* p = first;
        do {
            g_read_size = *p;
            side_effect();
            ++p;
        } while (--i);
        side_effect();
    } while (--n);
}
I know this is late but one quick observation: unsigned instructions can be slightly faster. As I understand it, the hardware implementation is just a tad simpler because there is no sign bit that can change on increment or decrement. That is just one clock cycle to a P6 micro-architecture CPU but these things all add up.
I expect that size_t is unsigned as there are no negative 'sizes'.
You MUST profile with the chip manufacturers profiler (Intel VTUNE -- 30 day trial; AMD CodeAnalyzer is free) because there are just too many things that can be in play: pipeline stalls, cache misses, misaligned data, store-load dependencies ...
This business is a whole lot more complex than it was 44 years ago when I wrote my first Fortran program in High School. Nobody programs in assembler anymore. One real head-scratcher was looking at PA-Risc (HP Unix system chip in the 90's) instructions generated from C programs ... the operations were not in the order I expected because the code generator understood the internal operations of the PA-Risc CPU. The instructions were ordered funny because they would make sense in the CPU but not on my piece of paper.
 
         加载中,请稍侯......
 加载中,请稍侯......
      
精彩评论