|
;-----------------------------------------------------------------------
; hfilter -- horizontal FIR filter (described as 31-tap), MMX version.
;
; Produces ONE filtered output sample per image line, then steps both
; the source and destination pointers down to the next line, for
; OutputHeight% lines in total.
;
; Entry (parameters arrive in registers):
;   A% (eax) = address of 16-bit coefficient data (signed)
;   B% (ebx) = address of 8-bit source data (signed)
;   C% (ecx) = source address step per line (width in pixels)
;   D% (edx) = address of destination data (signed)
;
; Also uses:
;   OutputWidth%  = destination address step per line
;   OutputHeight% = number of lines processed
;   sclip         = routine defined elsewhere, called once per sample
;
; Working registers inside the loop:
;   ebp -> coefficients      esi -> source        edi -> destination
;   edx =  source step       ebx =  dest step     ecx =  lines left
;
; Notes:
;   * Each sample reads 32 source bytes and 32 coefficient words.
;     NOTE(review): the header says 31 taps, so presumably one of the
;     32 coefficients is zero padding -- confirm against the table.
;   * eax, ebx, ecx, edx, esi, edi, ebp, the flags and mm0-mm7 are all
;     overwritten; only esp is restored before returning.
;   * The loop is count-down-to-zero (dec/jnz), so OutputHeight% must
;     be at least 1; a value of 0 would wrap the counter.
;-----------------------------------------------------------------------
.hfilter
        mov     esi,ebx                 ; esi -> source (B%); ebx is now free
        mov     ebx,OutputWidth%        ; ebx = destination step per line
        mov     edi,edx                 ; edi -> destination (D%); must precede the edx reload
        mov     edx,ecx                 ; edx = source step per line (C%)
        mov     ecx,OutputHeight%       ; ecx = line counter
        mov     ebp,eax                 ; ebp -> coefficients (A%)
;
; Build an 8-byte-aligned scratch area on the stack for the movq below.
;
        mov     eax,esp                 ; remember the caller's stack pointer
        and     esp,-8                  ; round down to a multiple of 8 (AMD Athlon)
        push    eax                     ; keep the original esp where 'pop esp' can find it
        sub     esp,12                  ; 4 (push) + 12 = 16, so esp is 8-aligned again
;
.hsloop
;
; Stage 1: load 8 groups of 4 source bytes.  Each byte lands in the
; HIGH half of a 16-bit word; the low halves are stale register data
; that the shift in stage 2 discards.
;
        punpcklbw mm0,[esi]             ; samples  0-3
        punpcklbw mm1,[esi+4]           ; samples  4-7
        punpcklbw mm2,[esi+8]           ; samples  8-11
        punpcklbw mm3,[esi+12]          ; samples 12-15
        punpcklbw mm4,[esi+16]          ; samples 16-19
        punpcklbw mm5,[esi+20]          ; samples 20-23
        punpcklbw mm6,[esi+24]          ; samples 24-27
        punpcklbw mm7,[esi+28]          ; samples 28-31
;
; Stage 2: arithmetic shift right by 8 turns each word into the
; sign-extended 8-bit sample.
;
        psraw   mm0,8
        psraw   mm1,8
        psraw   mm2,8
        psraw   mm3,8
        psraw   mm4,8
        psraw   mm5,8
        psraw   mm6,8
        psraw   mm7,8
;
; Stage 3: multiply by the matching coefficients; pmaddwd also adds
; adjacent products, leaving two 32-bit partial sums per register.
;
        pmaddwd mm0,[ebp]               ; coefficients  0-3
        pmaddwd mm1,[ebp+8]             ; coefficients  4-7
        pmaddwd mm2,[ebp+16]            ; coefficients  8-11
        pmaddwd mm3,[ebp+24]            ; coefficients 12-15
        pmaddwd mm4,[ebp+32]            ; coefficients 16-19
        pmaddwd mm5,[ebp+40]            ; coefficients 20-23
        pmaddwd mm6,[ebp+48]            ; coefficients 24-27
        pmaddwd mm7,[ebp+56]            ; coefficients 28-31
;
; Stage 4: pairwise reduction of the eight registers into mm0.
;
        paddd   mm0,mm1                 ; products  0-7
        paddd   mm2,mm3                 ; products  8-15
        paddd   mm4,mm5                 ; products 16-23
        paddd   mm6,mm7                 ; products 24-31
        paddd   mm0,mm2                 ; products  0-15
        paddd   mm4,mm6                 ; products 16-31
        paddd   mm0,mm4                 ; products  0-31, still as two dwords
;
; Stage 5: fold the two dwords together via the aligned scratch area,
; then scale and round.
;
        movq    [esp],mm0               ; spill both partial sums
        mov     eax,[esp]               ; low partial sum
        add     eax,[esp+4]             ; plus high partial sum = full total
        sar     eax,15                  ; divide by 32768; CF = last bit shifted out
        adc     eax,0                   ; add that bit back in: round to nearest
;
; The pointers, steps and counter stay live across this call, so sclip
; must leave ebx, ecx, edx, esi, edi and ebp intact.
; NOTE(review): presumably sclip clamps eax to the signed 8-bit range;
; it is not visible here -- confirm.
;
        call    sclip
        mov     [edi],al                ; store the low byte as the output sample
;
; Move on to the same column of the next line.
;
        lea     esi,[esi+edx]           ; source      += source step
        lea     edi,[edi+ebx]           ; destination += OutputWidth%
        dec     ecx                     ; one line fewer to do
        jnz near hsloop
;
; Epilogue: release the scratch area and undo the alignment.
;
        add     esp,12                  ; esp -> saved original stack pointer
        pop     esp                     ; restore the caller's esp
        emms                            ; leave MMX state so x87 code can run again
        ret
|
|