Help me improve some more SSE2 code

2023-02-13 07:39 问答作者：

I am looking for some help to improve this bilinear scaling sse2 code on core2 cpus

On my Atom N270 and on an i7 this code is about 2x faster than the mmx code. But under core2 cpus it is only equal to the mmx code.

Code follows

void ConversionProcess::convert_SSE2(BBitmap *from, BBitmap *to)
{
    uint32 fromBPR, toBPR, fromBPRDIV4, x, y, yr, xr;

    ULLint start = rdtsc();
    ULLint stop;
    if (from && to) {
        uint32 width, height;
        width = from->Bounds().IntegerWidth() + 1;
        height = from->Bounds().IntegerHeight() + 1;

        uint32 toWidth, toHeight;
        toWidth = to->Bounds().IntegerWidth() + 1;
        toHeight = to->Bounds().IntegerHeight() + 1;

        fromBPR = from->BytesPerRow();
        fromBPRDIV4 = fromBPR >> 2;
        toBPR = to->BytesPerRow();

        uint32 x_ratio = ((width-1) << 7) / toWidth ;
        uint32 y_ratio = ((height-1) << 7) / toHeight ;

        uint8* toPtr = (uint8*)to->Bits();
        uint8* fromPtr1 = (uint8*)from->Bits();
        uint8* fromPtr2 = (uint8*)from->Bits() + fromBPR;

        struct FilterInfo {
            uint16 one_minus_diff;      // one minus diff
            uint16 diff;                // diff value used to calculate the weights used to average the pixels
            uint16 one_minus_diff_rep;  // one minus diff repeated
            uint16 diff_rep;            // diff value used to calculate the weights used to average the pixels repeated
        };

        FilterInfo *xWeights = (FilterInfo *)memalign(16, toWidth * 8);
        FilterInfo *yWeights = (FilterInfo *)memalign(16, toHeight * 8);
        uint32 *xIndexes = (uint32 *)memalign(16, (toWidth+2) * 4);     // will overread by 2 index
        uint32 *yIndexes = (uint32 *)memalign(16, toHeight * 4);

        x = 0;
        for (uint32 j=0;j < toWidth;j++) {
            xr = x >> 7;
            xWeights[j].diff = x - (xr << 7);
            xWeights[j].one_minus_diff = 127 - xWeights[j].diff;
            xWeights[j].one_minus_diff_rep = xWeights[j].one_minus_diff;
            xWeights[j].diff_rep = xWeights[j].diff;
            xIndexes[j] = xr << 2;

            x += x_ratio;
        }

        y = 0;
        for (uint32 j=0;j < toHeight; j++) {
            yr = y >> 7;
            yWeights[j].diff = y - (yr << 7);
            yWeights[j].one_minus_diff = 127 - yWeights[j].diff;
            yIndexes[j] = (yr * fromBPR);
            y += y_ratio;

        }

        for (uint32 i=0;i < toHeight; i++) {
            _ScaleSSE2X2(toPtr, fromPtr1 + yIndexes[i], fromPtr2 + yIndexes[i], xIndexes, xWeights, &yWeights[i], toWidth);
            toPtr += toBPR; 
        }

        free(xWeights);
        free(yWeights);
        free(xIndexes);
        free(yIndexes);

        stop = rdtsc() - start;
        if (stop < timeTaken) {
            timeTaken = stop;
        }
    }
}

;
; Copyright (C) 2011 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised bilinear scaler

%macro  cglobal 1
    global  _%1
    %define %1 _%1
    align 16
%1:
%endmacro

SECTION .data align=16

RGB_AND db  0xff
        db  0x00
        db  0x00
        db  0x00
        db  0xff
        db  0x00
        db  0x00
        db  0x00
        db  0xff
        db  0x00
        db  0x00
        db  0x00
        db  0xff
        db  0x00
        db  0x00
        db  0x00

; void  _ScaleSSE2X2(void *toPtr, void *fromPtr1, void *fromPtr2, void* xIndexPtr, void *xWeightPtr, void *yWeightPtr, uint32 length);

length      equ ebp+32
yWeightPtr  equ ebp+28
xWeightPtr  equ ebp+24
xIndexPtr   equ ebp+20
fromPtr2    equ ebp+16
fromPtr1    equ ebp+12
toPtr       equ ebp+8

SECTION .text align=16
cglobal ScaleSSE2X2
; reserve registers. eax, ecx, edx automatically available
    push ebp
    mov ebp, esp
    push ebx    ; yWeights, xIndexPtr
    push edi    ; scratch
    push esi    ; fromPtr3

    mov esi, [fromPtr1]
    mov edx, [fromPtr2]
    mov eax, [xWeightPtr]
    mov ebx, [yWeightPtr]
    mov ecx, [length]

; calculate y weights and cache
    movd xmm7, [ebx]                ; get 1-yDiff and yDiff
    pshuflw xmm7, xmm7, 01010000b   ; 1-yDiff, 1-yDiff, yDiff, yDiff
    pshufd xmm7, xmm7, 01000100b    ; duplicate

    mov ebx, [xIndexPtr]

    push ebp                        ; reuse frame ptr for toPtr
    mov ebp, [toPtr]                ; Cannot use parameter refs anymore

    shr ecx,1

    ; calculate first index
    mov edi, [ebx]                  ; index

    align 16
REPEATLOOPX2:

    ; load first and second set of weights into xmm3
    movdqa xmm3, [eax]              ; get 1-xDiff, xDiff, 1-xDiff, xDiff
    pmullw xmm3, xmm7               ; calculate F1, F2, F3, F4 (2)
    add eax, 16

    ; load first set of source pixels
    movq xmm0, [esi+edi]            ; xmm0 = fromPtr1 + index | fromPtr1 + index + 4
    movq xmm1, [edx+edi]            ; xmm1 = fromPtr2 + index | fromPtr2 + index + 4
    punpcklqdq xmm0, xmm1           ; combine all 4 pixels into xmm0

    sub edi, [ebx+4]                ; if the x index is the same then skip the second load
    jz SKIP

    ; calculate second index
    mov edi, [ebx+4]                ; index

    ; load second set of source pixels
    movq xmm4, [esi+edi]            ; xmm4 = fromPtr1 + index | fromPtr1 + index + 4
    movq xmm5, [edx+edi]            ; xmm5 = fromPtr2 + index | fromPtr2 + index + 4
    punpcklqdq xmm4, xmm5           ; combine all 4 pixels into xmm4

    movdqa xmm1, xmm0               ; copy to xmm1, xmm开发者_如何学Python2
    pshufd xmm2, xmm0, 0xE4 
    movdqa xmm5, xmm4               ; copy to xmm1, xmm2
    pshufd xmm6, xmm4, 0xE4 

    jmp NEXT
align 16
SKIP:
    movdqa xmm1, xmm0               ; copy to xmm1, xmm2
    pshufd xmm2, xmm0, 0xE4 
    movdqa xmm4, xmm0               ; copy first pixel set xmm0 to second pixel set xmm4
    pshufd xmm5, xmm4, 0xE4         ; copy to xmm4, xmm6
    movdqa xmm6, xmm4               
NEXT:
;   prefetchnta [edx+edi+16]

    add ebx, 8

; calculate dest rgb values using color = a * F1 + b * F2 + c * F3 + d * F4

; extract b from both sets of pixels and combine into a single reg
    pand xmm0, [RGB_AND]            ; clear all but r values leaving b000
    pand xmm4, [RGB_AND]            ; clear all but r values leaving b000
    packssdw xmm0, xmm4             ; pack down to 16 bit values

    movdqa xmm4, [RGB_AND]          ; xmm4 is now free
    pmaddwd xmm0, xmm3              ; multiply and add to get temp1 = a * F1 + b * F2, temp2 = c * F3 + d * F4

; extract g
    psrld xmm1, 8                   ; rotate g to low bytes
    pand xmm1, xmm4                 ; extract g values g000
    psrld xmm5, 8                   ; rotate g to low bytes
    pand xmm5, xmm4                 ; extract g values g000
    packssdw xmm1, xmm5             ; pack down to 16 bit values

    pmaddwd xmm1, xmm3              ; multiply and add

; extract r
    psrld xmm2, 16                  ; rotate b to low bytes
    pand xmm2, xmm4                 ; extract b values b000
    psrld xmm6, 16                  ; rotate b to low bytes
    pand xmm6, xmm4                 ; extract b values b000
    packssdw xmm2, xmm6             ; pack down to 16 bit values

    pmaddwd xmm2, xmm3              ; multiply and add

;   Add temp1 and temp2 leaving us with rrrr xxxx rrrr xxxx
    psrld xmm0, 14                  ; scale back to range
    pshufd xmm3, xmm0, 00110001b    ; extract temp2
    paddd xmm0, xmm3                ; add back to temp1

    psrld xmm1, 14                  ; scale back to range
    pshufd xmm3, xmm1, 00110001b
    paddd xmm1, xmm3                ; add

    psrld xmm2, 14                  ; scale back to range
    pshufd xmm3, xmm2, 00110001b
    paddd xmm2, xmm3                ; add

;   recombine into 2 rgba values

    pslld xmm1, 8
    por xmm0, xmm1
    pslld xmm2, 16
    por xmm0, xmm2
    pshufd xmm0, xmm0, 00001000b    ; shuffle down

    movq [ebp], xmm0                ; output 32bit * 2
    add ebp, 8

    mov edi, [ebx]                  ; index

    sub ecx, 1
    jnz REPEATLOOPX2

; Cleanup

    pop ebp
    pop esi
    pop edi
    pop ebx
    mov esp, ebp
    pop ebp
    ret

Two suggestions:

run this code in a test harness under a decent profiler on Core 2 (e.g. Zoom) to see where the hotspots and dependency/other stalls are
re-write the SIMD code using intrinsics and then let the compiler handle register allocation, instruction scheduling and other optimisations - a decent compiler such as ICC, or even gcc will do a lot better job than your hand-coded assembly. And as a bonus you can also re-target for different x86 CPU families without having to re-write your code.

继续阅读：assembly simd sse x86

Help me improve some more SSE2 code

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？