Keresés

Aktív témák

  • P.H.

    senior tag

    válasz P.H. #39 üzenetére

    a #39-beli összehasonlításban használt vonalankénti x87 kód:

    // Per-line x87 routine: steps along a line one pixel at a time and plots
    // every in-bounds position with the pen pattern read from [edi+ebx].
    // NOTE(review): _EAX/_ECX/_EDX/_EBP are presumably the offsets of the saved
    // registers inside the PUSHAD frame - confirm against their definitions.
    // NOTE(review): entry registers are not documented in this post; they look
    // like EAX/EDX = start X/Y and ECX/EBP = end X/Y - confirm.

    // Turn ECX and EBP into deltas relative to EAX and EDX, then save all
    // registers so the deltas can be loaded from the stack by FILD.
    sub ebp,edx
    sub ecx,eax
    pushad
    lea edi,[esi+...+DATA2]
    fild dword ptr [esp+_ECX]
    // Move the low word of EAX into the high word: FNSTSW AX below overwrites
    // AX, and SAR EAX,10h later brings the value back (sign-extended).
    shl eax,10h
    fild dword ptr [esp+_EBP]
    mov esi,[esi+...DEST]
    // Compare the absolute values of the two deltas on copies, leaving the
    // signed deltas on the x87 stack (FCOMPP pops both copies).
    fld st(1)
    mov ebx,[edi-..._DATA1]
    fabs
    fld st(1)
    fabs
    fcompp
    sub edi,ebx
    // Copy the x87 condition codes into EFLAGS; AH keeps them for the second
    // SAHF further down.
    fnstsw ax
    sahf
    // "Above" = |EBP delta| > |ECX delta|: EBP already holds the longer delta.
    ja @inlineMOVEX
    // Otherwise the ECX delta is the longer one: use it as the step count and
    // swap the x87 operands so the division below is still shorter/longer.
    mov ebp,ecx
    fxch
    @inlineMOVEX:
    // The flags set by TEST are consumed by JZ/JNS below; the x87 instructions
    // in between do not modify EFLAGS.
    test ebp,ebp
    // st(0) = shorter delta / longer delta, then push the unit step 1.0.
    fdiv
    fld1
    // A zero-length major delta means there is nothing to step through.
    jz @return
    jns @inlineSETDIR
    // Negative major delta: count with a positive EBP and step by -1.0.
    neg ebp
    fchs
    @inlineSETDIR:
    // Reload the |delta| comparison result from AH.
    sahf
    // Give the fractional step the same sign as the unit step.
    fmul st(1),st
    // Swap the two steps unless the comparison was "below or equal", so that
    // st(0)/st(1) line up with the EAX/EDX coordinates loaded next.
    jbe @inlineCOORDINATES
    fxch
    @inlineCOORDINATES:
    // Load the start coordinates; stack is now: EAX coord, EDX coord, and the
    // two steps in st(2)/st(3). SAR restores EAX after the SHL above.
    fild dword ptr [esp+_EDX]
    sar eax,10h
    fild dword ptr [esp+_EAX]
    @setpixel:
    // Bounds test on the integer coordinates in EDX and EAX, accumulated in
    // CL/CH; any failed comparison skips the plot. The FADD interleaved here
    // advances the st(0) coordinate by its step.
    cmp edx,[edi-...+_TOPLEFT]
    fadd st,st(2)
    setl cl
    cmp edx,[edi-...+_BOTTOM]
    setge ch
    // EDX = EDX * [_DX] + EAX: byte offset of the pixel.
    imul edx,[edi-...+_DX]
    or ch,cl
    cmp eax,[edi-...+_RIGHT]
    setge cl
    add edx,eax
    or ch,cl
    cmp eax,[edi-...+_TOPLEFT]
    // Park EBX in the saved-ECX slot; it is used as the loop index in @round.
    mov [esp+_ECX],ebx
    setl cl
    mov al,[edi+ebx-_ADDER+_DRAWCOLOR]
    or cl,ch
    jnz @continueLINE
    @round:
    // Add successive dword offsets from [edi+ebx] to the pixel offset and
    // store the colour byte at each; the loop runs while EBX stays negative
    // after the increment (MOV does not change the flags set by ADD EBX).
    add edx,[edi+ebx]
    add ebx,04h
    mov [esi+edx],al
    js @round
    mov ebx,[esp+_ECX]
    @continueLINE:
    // Advance the other coordinate, then convert both back to integers through
    // the scratch stack slot (FIST stores without popping). The flags from
    // SUB EBP,01h survive until JGE, so the loop runs EBP+1 times.
    fxch
    fadd st,st(3)
    sub ebp,01h
    fist dword ptr [esp+_ECX]
    fxch
    mov edx,[esp+_ECX]
    fist dword ptr [esp+_ECX]
    mov eax,[esp+_ECX]
    jge @setpixel
    // FCOMPP is used only to pop two x87 registers (the coordinates here, the
    // two steps after @return); the comparison result is not read.
    fcompp
    @return:
    popad
    fcompp

    [ Szerkesztve ]

    Arguing on the Internet is like running in the Special Olympics. Even if you win, you are still ... ˙˙˙ Real Eyes Realize Real Lies ˙˙˙

  • P.H.

    senior tag

    válasz P.H. #39 üzenetére

    Line algoritmus nagy mennyiségű vonalhoz, új megfogalmazásban:

    bemenő paraméterek:
    EAX: X0 coordinate
    EDX: Y0 coordinate
    ECX: X1 coordinate
    EBP: Y1 coordinate
    ESI: BITS array
    EDI: _ADDER array
    MM7: [$0000][$0000][width][ 1] // image width + 1 (UINTs)
    XMM5: [ CHS][ CHS][ CHS][ CHS] // sign change constants
    XMM6: [bttom][right][ top][ left] // image boundaries
    XMM7: [-----][-----][-----][ 1.0]

    // SSE1/MMX line routine for the register contract listed above: walks from
    // (EAX,EDX) towards (ECX,EBP) and plots every in-bounds position with the
    // pen offsets stored below EDI.
    // NOTE(review): MMX registers are used and no EMMS is visible in this
    // snippet - presumably the caller issues it; confirm.

    // xmm2 lane 0 = start X as float; ECX/EBP become the X/Y deltas.
    cvtsi2ss xmm2,eax
    sub ecx,eax
    sub ebp,edx
    push ebx
    // EBX = |ECX| and EDX = |EBP| using the sign-mask idiom
    // (mask = value SAR 31; (value XOR mask) - mask). xmm3 lane 0 = start Y.
    mov eax,ecx
    mov ebx,ecx
    sar eax,1Fh
    cvtsi2ss xmm3,edx
    xor ebx,eax
    mov edx,ebp
    sub ebx,eax
    mov eax,ebp
    sar eax,1Fh
    xor edx,eax
    sub edx,eax
    // Keep the delta with the larger magnitude in EBP (step count) and the
    // other one in ECX.
    cmp edx,ebx
    jae @movement
    xchg ebp,ecx
    @movement:
    cvtsi2ss xmm1,ebp
    // xmm2 lanes 0 and 2 now hold start X and start Y.
    shufps xmm2,xmm3,01000100b
    // The flags from TEST feed both JZ here and JNS further down; the SSE
    // instructions in between do not modify EFLAGS.
    test ebp,ebp
    jz @return
    // xmm0 lane 0 = minor delta * approximate reciprocal of the major delta.
    cvtsi2ss xmm0,ecx
    rcpss xmm1,xmm1
    // xmm2 = [X, Y, X, Y] (lane 0 first).
    shufps xmm2,xmm2,10001000b
    mulss xmm0,xmm1
    // xmm0 = [slope, slope, xmm7 lane 0, xmm7 lane 0].
    shufps xmm0,xmm7,00000000b
    jns @direction
    // Negative major delta: count with a positive EBP and flip the sign of
    // every step lane using the XMM5 constants.
    neg ebp
    xorps xmm0,xmm5
    @direction:
    // Repeat the |delta| comparison (EDX and EBX are still intact); EBX is
    // reloaded afterwards, MOV leaves the flags untouched for JAE.
    cmp edx,ebx
    mov ebx,[edi+_PENWIDTH]
    jae @inlineCOORDINATES
    // |EBP delta| < |ECX delta|: exchange lanes 0 and 2 so the unit step
    // lands on the X lane after the final shuffle.
    shufps xmm0,xmm0,11000110b
    @inlineCOORDINATES:
    mov al,[edi+_DRAWCOLOR]
    // EDI is biased by EBX so that [edi+ecx] with ECX starting at EBX walks
    // the offset table; it is restored by ADD EDI,EBX at the end.
    sub edi,ebx
    // xmm0 = [lane 0, lane 2, lane 0, lane 2]: per-pixel (X step, Y step) pair
    // duplicated to match the layout of xmm2.
    shufps xmm0,xmm0,10001000b
    @setpixel:
    // mm0 = current X and Y converted to 32-bit integers.
    cvtps2pi mm0,xmm2
    // CMPLTPS overwrites XMM6, so the boundaries are parked in XMM4 and
    // restored after MOVMSKPS.
    movaps xmm4,xmm6
    cmpltps xmm6,xmm2
    // Pack the low words of X and Y next to each other for PMADDWD.
    pshufw mm0,mm0,11111000b
    // Advance the coordinates for the next iteration.
    addps xmm2,xmm0
    movmskps edx,xmm6
    movaps xmm6,xmm4
    // Low dword of mm0 = X * mm7.word0 + Y * mm7.word1.
    pmaddwd mm0,mm7
    // Plot only when the mask is exactly 0011b: the two low boundary lanes
    // compare less-than the coordinate and the two high lanes do not.
    cmp edx,03h
    jnz @continueLINE
    movd edx,mm0
    mov ecx,ebx
    @rounds:
    // Add successive dword offsets from the table to the pixel offset and
    // store the colour byte at each; loops while ECX stays negative after the
    // increment (MOV does not change the flags set by ADD ECX).
    add edx,[edi+ecx]
    add ecx,04h
    mov [esi+edx],al
    js @rounds
    @continueLINE:
    // Runs EBP+1 times in total.
    sub ebp,01h
    jge @setpixel
    add edi,ebx
    @return:
    pop ebx

    [ Szerkesztve ]

    Arguing on the Internet is like running in the Special Olympics. Even if you win, you are still ... ˙˙˙ Real Eyes Realize Real Lies ˙˙˙

  • P.H.

    senior tag

    válasz P.H. #39 üzenetére

    SSE1 line algoritmus azonos stílusú vonalak láncolt listájára, a #39-hez képest megtáltosítva; align/cím- és Northwood + Merom + K10 portelemzéssel.

    Intel processzorokon Core2 óta a MOVD reg,mmreg (2 órajel) gyorsabb, mint az L1D-olvasás (3-4 órajel), a Sandy Bridge óta pedig kvázi +regiszterkészletként használhatóak az MMX és XMM regiszterek (1 órajel a MOVD oda-vissza). K10-en sem lassabb a MOVD reg,mmreg, mint az L1D-hozzáférés.

    // SSE1/MMX line routine for a linked list of lines drawn with one pen.
    // Setup (executed once):
    //   xmm3 = four boundary values converted to float (_TOPLEFT0 pair in the
    //          low lanes, _RIGHT pair in the high lanes)
    //   xmm4 = 3F800000h in every lane (1.0), built as all-ones << 25 >> 2
    //   xmm7 = 80000000h in every lane (sign mask), built as all-ones << 31
    //   mm0  = ([_DX] << 16) + 1, the PMADDWD multipliers for offset = X + Y*[_DX]
    //   mm2  = [_COLOR], ECX = [_PEN], EDI biased by ECX, ESI = bitmap bits
    // Per line (@1stline): EBX = current TMAPRECORD (0 ends the list), mm3 =
    // its NX link; xmm5 = [X, Y, X, Y] start position, xmm0 = per-pixel steps,
    // EBP = |major delta| (loop runs EBP+1 times), AL = colour.
    // NOTE(review): the matching PUSHAD and the initial EBX value are outside
    // this snippet - confirm against the enclosing procedure.
    // The trailing columns are the author's Northwood / Merom / K10 port,
    // latency and throughput annotations; {@xx} are code addresses.
    {@31} lea edi,[esi+TBITMAPFILE.IDATA+_ADDER] // p01 d (1) alu p0 1 (1) p012 1 (1) ALU
    xorps xmm1,xmm1 // p1 2 (1) mmxalu p015 1 (1) p34 2 (1) FA/M
    mov eax,[esi+TBITMAPFILE.IDATA+_DX] // p2 2 (1) load p2 2 (1) p012 3 (1) MEM
    {@40} cvtpi2ps xmm3,[edi-_ADDER+_TOPLEFT0] // p1+2 10 (4) mmx+load p1+2 (1) p34+5 7 (2) FPU+MEM
    pcmpeqd xmm4,xmm4 // p1 2 (1) mmxalu p01 1 (1) p34 2 (1) FA/M
    cvtpi2ps xmm2,[edi-_ADDER+_RIGHT] // p1+2 10 (4) mmx+load p1+2 (1) p34+5 7 (2) FPU+MEM
    pcmpeqd xmm7,xmm7 // p1 2 (1) mmxalu p01 1 (1) p34 2 (1) FA/M
    {@50} mov ecx,[edi-_ADDER+_PEN] // p2 2 (1) load p2 2 (1) p012 3 (1) MEM
    pslld xmm4,25 // p1 2 (1) mmxshf p0 1 (1) p34 3 (1) FA/M
    mov esi,[esi+TBITMAPFILE.BITS] // p2 2 (1) load p2 2 (1) p012 3 (1) MEM
    pslld xmm7,1Fh // p1 2 (1) mmxshf p0 1 (1) p34 3 (1) FA/M
    {@60} movd mm2,[edi-_ADDER+_COLOR] // p2 8 (1) mmxalu p2 2 (1) p345 4 (1) FANY
    shl eax,10h // p1 4 (1) mmxshf p05 1 (1) p012 1 (1) ALU
    movlhps xmm3,xmm2 // p1 2 (1) mmxshf p0 1 (1) p34 3 (1) FA/M
    add eax,01h // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    sub edi,ecx // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    {@6F} psrld xmm4,02h // p1 2 (1) mmxshf p0 1 (1) p34 3 (1) FA/M
    movd mm0,eax // p1 2 (2) mmxalu p05 2 (1) p012 6 (2) ALU
    jmp @1stline // p1 0 (1) branch p5 1 (1) p012 2 (1) ALU
    // The next line is never executed (it follows an unconditional JMP); it is
    // 7 bytes of filler that place @reorder at address 80h.
    { x7 } mov eax,00000000h; mov edx,ecx //
    @reorder: //
    // Major axis is Y: arrange the steps as [slope, +-1, slope, +-1].
    {@80} shufps xmm0,xmm0,11011000b // p1 4 (1) mmxshf p1 3 (3) p34 3 (1) FA/M
    @setpixels: //
    // Per pixel: convert X/Y to integers, test them against the boundaries in
    // xmm3, advance the position, and compute the byte offset with PMADDWD.
    {@84} cvtps2pi mm1,xmm5 // p0+1 7 (3) fp-mmx p1 3 (1) p5 4 (1) FMISC
    movaps xmm2,xmm3 // p0 6 (1) mov p015 1 (1) p345 2 (1) FANY
    mov ebx,ecx // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    pshufw mm1,mm1,11011000b // p1 2 (1) mmxshf p5 1 (1) p34 2 (1) FA/M
    {@90} cmpltps xmm2,xmm5 // p1 4 (1) fpadd p1 3 (1) p3 (1) FADD
    addps xmm5,xmm0 // p1 4 (1) fpadd p1 3 (1) p3 4 (1) FADD
    pmaddwd mm1,mm0 // p1 6 (1) fpmul p1 3 (1) p4 3 (1) FMUL
    movmskps edx,xmm2 // p1 6 (2) fp p0 1 (1) p34 3 (1) FA/M
    // Plot only when the mask is exactly 0011b (low two boundary lanes are
    // below the coordinate, high two are not).
    cmp edx,03h // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    {@A0} jnz @continueLINE // p0 2 (1) alu p5 1 (1) p012 1 (1) ALU
    movd edx,mm1 // p0 5 (2) fp p015 2 (1) p3 3 (1) FADD
    @round: //
    // Apply each dword pen offset from [edi+ebx] and store the colour byte;
    // loops while EBX stays negative after the increment.
    add edx,[edi+ebx] // p01+2 d+2(2) alu+load p015+2 (1) p012 4 (1) ALU+MEM
    mov [esi+edx],al // p0+3 2 (3) store p 34 3 (1) p012 3 (1) MEM
    add ebx,04h // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    js @round // p0 0 (1) branch p5 1 (1) p012 1 (1) ALU
    @continueLINE: //
    {@B0} sub ebp,01h // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    // Loop back to the per-pixel head. The label defined above is @setpixels;
    // the branch previously named a non-existent @setpixel.
    jge @setpixels // p0 0 (1) branch p5 1 (1) p012 1 (1) ALU
    @nxline: //
    // Fetch the link to the next record that was parked in mm3.
    {@B5} movd ebx,mm3 // p0 5 (2) fp p015 2 (1) p3 3 (1) FADD
    @1stline: //
    {@B8} cmp ebx,00h // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    jz @return // p0 0 (1) branch p5 1 (1) p012 1 (1) ALU
    // EBP/EAX = REF minus SELF coordinate deltas (Y/X); xmm5 low lanes = the
    // two integers at SELF.XCOOR converted to float.
    mov eax,[ebx+TMAPRECORD.REF] // p2 2 (1) load p2 2 (1) p012 3 (1) MEM
    {@C0} mov edx,[ebx+TMAPRECORD.SELF] // p2 2 (1) load p2 2 (1) p012 3 (1) MEM
    mov ebp,[eax+TMAPHEADER.YCOOR] // p2 2 (1) load p2 2 (1) p012 3 (1) MEM
    cvtpi2ps xmm5,[edx+TMAPHEADER.XCOOR] // p1+2 10+2(4) mmx+load p1+2 (1) p34+5 7 (2) FPU+MEM
    mov eax,[eax+TMAPHEADER.XCOOR] // p2 2 (1) load p2 2 (1) p012 3 (1) MEM
    sub ebp,[edx+TMAPHEADER.YCOOR] // p01+2 d+2(2) alu+load p015+2 (1) p012 4 (1) ALU+MEM
    sub eax,[edx+TMAPHEADER.XCOOR] // p01+2 d+2(2) alu+load p015+2 (1) p012 4 (1) ALU+MEM
    {@D0} xor edx,edx // p0 d (1) logic p015 1 (1) p012 1 (1) ALU
    movlhps xmm5,xmm5 // p1 2 (1) mmxshf p0 1 (1) p34 3 (1) FA/M
    movd mm3,ds:[ebx+TMAPRECORD.NX] // p2 8 (1) mmxalu p2 2 (1) p345 4 (1) FANY
    // EDX = |EBP| and EBX = |EAX| via 0 - value followed by CMOVS.
    xor ebx,ebx // p0 d (1) logic p015 1 (1) p012 1 (1) ALU
    sub edx,ebp // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    cmovs edx,ebp // p0+1 6 (3) alu p015 2 (2) p012 1 (1) ALU
    {@E0} sub ebx,eax // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    cmovs ebx,eax // p0+1 6 (3) alu p015 2 (2) p012 1 (1) ALU
    // If |Y delta| < |X delta| swap EAX and EBP so EBP holds the major delta;
    // EBX ends up as a copy of the major delta. SBB then records the choice in
    // EDX (-1 = swapped, 0 = not), since MOV/CMOV leave CF untouched.
    cmp edx,ebx // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    mov ebx,ebp // p01 d (1) alu p015 1 (1) p012 1 (1) ALU
    cmovb ebx,eax // p0+1 6 (3) alu p015 2 (2) p012 1 (1) ALU
    cmovb eax,ebp // p0+1 6 (3) alu p015 2 (2) p012 1 (1) ALU
    cmovb ebp,ebx // p0+1 6 (3) alu p015 2 (2) p012 1 (1) ALU
    {@F2} sbb edx,edx // p1 5 (3) alu p015 2 (2) p012 1 (1) ALU
    // The sign flag from NEG is consumed by CMOVNS below (EBP = |major|).
    neg ebx // p0 d (1) alu0 p015 1 (1) p012 1 (1) ALU
    // Register-to-itself move: has no architectural effect.
    // NOTE(review): presumably alignment padding - confirm.
    mov ecx,ecx //
    // xmm0 lane 0 = minor delta / major delta.
    cvtsi2ss xmm0,eax // p1 10 (3) fp-mmx p1 4 (1) p345 14 (v) FPU+ALU
    cvtsi2ss xmm1,ebp // p1 10 (3) fp-mmx p1 4 (1) p345 14 (v) FPU+ALU
    {@00} movaps xmm2,xmm1 // p0 6 (1) mov p015 1 (1) p345 2 (1) FANY
    divss xmm0,xmm1 // p1 23 (1) fpdiv p0 17 (1) p4 16 (1) FMUL
    cmovns ebp,ebx // p0+1 6 (3) alu p015 2 (2) p012 1 (1) ALU
    // xmm2 = sign bit of the major delta in every lane; xmm0 = [slope, slope,
    // 1.0, 1.0] with that sign applied by XORPS.
    shufps xmm2,xmm2,00000000b // p1 4 (1) mmxshf p1 3 (3) p34 3 (1) FA/M
    test edx,edx // p0 d (1) logic p015 1 (1) p012 1 (1) ALU
    {@10} andps xmm2,xmm7 // p1 2 (1) mmxalu p015 1 (1) p34 2 (1) FA/M
    shufps xmm0,xmm4,00000000b // p1 4 (1) mmxshf p1 3 (3) p34 3 (1) FA/M
    movd eax,mm2 // p0 5 (2) fp p015 2 (1) p3 3 (1) FADD
    xorps xmm0,xmm2 // p1 2 (1) mmxalu p015 1 (1) p34 2 (1) FA/M
    {@1D} jz @reorder // p0 0 (1) branch p5 1 (1) p012 2 (1) ALU
    // Major axis is X: arrange the steps as [+-1, slope, +-1, slope].
    {@23} shufps xmm0,xmm0,01110010b // p1 4 (1) mmxshf p1 3 (3) p34 3 (1) FA/M
    jmp @setpixels // p1 0 (1) branch p5 1 (1) p012 2 (1) ALU
    @return:
    popad
    emms

    [ Szerkesztve ]

    Arguing on the Internet is like running in the Special Olympics. Even if you win, you are still ... ˙˙˙ Real Eyes Realize Real Lies ˙˙˙

Aktív témák