Author |
Topic: Letting GCC do the hard work (Read 1769 times) |
|
David Williams
Developer
member is offline

meh

Gender: 
Posts: 452
|
 |
Letting GCC do the hard work
« Thread started on: Aug 2nd, 2014, 6:26pm » |
|
You should be able to copy & paste the following code into the BB4W IDE and run it. The point of posting this here is not really the pretty graphics, but to show how some code was written in C (of which I have practically no experience), compiled to produce an assembly language dump which was subsequently 'imported' into BB4W (worked first time after some necessary modifications!). Big time saver. Enjoy.
Code:
REM WavePlasma6 // 02-08-2014
REM Animated wave demo. A grid of heights (hMap%) is disturbed by up to
REM MaxExciters% sinusoidal point sources ("exciters"), advanced one step
REM per frame by the machine-code routine Wave, and drawn through a colour
REM look-up table by DrawHMap.
*ESC OFF
*FLOAT 64
ON ERROR PROCError(REPORT$ + " at line " + STR$ERL)
HIMEM = PAGE + 10*&100000
R% = RND(-7109271) : REM Seed BB4W's PRNG
Delay% = TRUE
ScrW% = 512
ScrH% = 512
PROCFixWndSz
VDU 23,22,ScrW%;ScrH%;8,16,16,0 : OFF
dibs% = FNCreateDIBSection

REM Resolve the API entry points once so the main loop can SYS by address
GetTickCount% = FNSYS_NameToAddress("GetTickCount")
InvalidateRect% = FNSYS_NameToAddress("InvalidateRect")
Sleep% = FNSYS_NameToAddress("Sleep")

REM Three grids of 64-bit floats, each aligned to an 8-byte boundary:
REM hMap% = current heights, hMap2% = next heights, vMap% = velocities
gridW% = ScrW%
gridH% = ScrH%
gridSz% = 8*gridW%*gridH% : REM Grid size in bytes
DIM hMap% gridSz%+8 : hMap% =(hMap% +7) AND -8
DIM hMap2% gridSz%+8 : hMap2%=(hMap2%+7) AND -8
DIM vMap% gridSz%+8 : vMap% =(vMap% +7) AND -8

REM Colour table of 32-bit entries, aligned to a 4-byte boundary
colTabSz% = 10000
DIM colTab% 4*(colTabSz% + 1)
colTab%=(colTab%+3) AND -4

MaxExciters% = 8
DIM exciter{( MaxExciters%-1 ) active%, x%, y%, \
\ amp, theta, dtheta, life%, dying%}

PROCFillColourTable
PROCAssembleCode

REM Clear the grids
FOR I%=hMap% TO hMap%+gridSz%-1 STEP 8:|I% = 0.0:NEXT
FOR I%=hMap2% TO hMap2%+gridSz%-1 STEP 8:|I% = 0.0:NEXT
FOR I%=vMap% TO vMap%+gridSz%-1 STEP 8:|I% = 0.0:NEXT

dampTheta# = 0.0
dampDTheta# = 0.0001
dampAmp# = 0.05
scale# = 0.05
hDamp# = 0.1
first% = TRUE
frame% = 0
*REFRESH OFF
SYS GetTickCount% TO time0%
REPEAT
  FOR I% = 0 TO MaxExciters%-1
    IF exciter{(I%)}.active% THEN
      REM Drive the height at the exciter's cell with a sine wave
      X% = exciter{(I%)}.x%
      Y% = exciter{(I%)}.y%
      |(hMap% + 8*(Y%*gridW% + X%)) = \
      \ 1.0# * exciter{(I%)}.amp * SIN(exciter{(I%)}.theta)
      exciter{(I%)}.theta += exciter{(I%)}.dtheta
      IF exciter{(I%)}.life% > 0 THEN
        exciter{(I%)}.life%-=1
      ELSE
        exciter{(I%)}.dying% = TRUE
      ENDIF
      REM A dying exciter fades out, then is retired and its velocity cleared
      IF exciter{(I%)}.dying% THEN
        exciter{(I%)}.amp -= 1
        IF exciter{(I%)}.amp <= 0 THEN
          exciter{(I%)}.active% = FALSE
          |(vMap% + 8*(Y%*gridW% + X%)) = 0.0
        ENDIF
      ENDIF
    ELSE
      REM Free slot: always start one exciter on the first frame,
      REM afterwards with probability 1 in 2500 per slot per frame
      IF first%=TRUE OR RND(2500)=1 THEN
        IF first% THEN first% = FALSE
        exciter{(I%)}.active% = TRUE
        REM RND(n) returns 1..n, so RND(gridW%)-2 could yield -1 and make
        REM the stores above write outside the grids. RND(gridW%-2) keeps
        REM the position in 1..gridW%-2 (likewise for Y).
        exciter{(I%)}.x% = RND(gridW%-2)
        exciter{(I%)}.y% = RND(gridH%-2)
        exciter{(I%)}.amp = 100+RND(800)
        exciter{(I%)}.theta = 0
        IF RND(10) > 1 THEN
          exciter{(I%)}.dtheta = 0.01 * RND(1)
        ELSE
          exciter{(I%)}.dtheta = 0.005 * RND(1)
        ENDIF
        exciter{(I%)}.life% = 500+RND(1000)
        exciter{(I%)}.dying% = FALSE
      ENDIF
    ENDIF
  NEXT I%

  REM Slowly vary the damping, step the simulation, then draw it
  hDamp# = 1.0#*(0.01+dampAmp#*ABSSIN(dampTheta#))
  dampTheta# += dampDTheta#
  SYS Wave, gridW%, gridH%, hMap%, hMap2%, vMap%, ^scale#, ^hDamp#
  SYS DWordCopy, hMap2%, hMap%, 2*gridW%*gridH%
  CALL DrawHMap
  SYS InvalidateRect%, @hwnd%, 0, 0
  *REFRESH
  IF Delay% THEN SYS Sleep%, 2

  REM Show the frame rate in the title bar about once a second
  frame% += 1
  SYS GetTickCount% TO time1%
  IF time1%-time0% >= 1000 THEN
    SYS "SetWindowText", @hwnd%, STR$frame% + " fps"
    SYS GetTickCount% TO time0%
    frame% = 0
  ENDIF
UNTIL FALSE
END
DEF PROCFillColourTable
REM Fill entries 0..colTabSz%-1 of the table at colTab% with 32-bit
REM colours (&RRGGBB). Each channel is 128 + 127 * the product of two
REM sine waves whose start phases and step sizes are chosen at random.
LOCAL N%, K%, red%, green%, blue%
LOCAL ph(), dph()
DIM ph(6), dph(6)
REM Random numbers are drawn in the order phase 1, step 1, phase 2, step 2...
FOR K% = 1 TO 6
  ph(K%) = 2*PI*RND(1)
  dph(K%) = 0.1*RND(1)
NEXT K%
FOR N% = 0 TO colTabSz%-1
  red% = 128+127*SIN(ph(1))*SIN(ph(2))
  green% = 128+127*SIN(ph(3))*SIN(ph(4))
  blue% = 128+127*SIN(ph(5))*SIN(ph(6))
  colTab%!(4*N%) = red%*&10000 + green%*&100 + blue%
  REM Advance all six phases for the next table entry
  FOR K% = 1 TO 6
    ph(K%) += dph(K%)
  NEXT K%
NEXT N%
ENDPROC
DEF PROCAssembleCode
REM Assemble the machine-code routines DrawHMap, DWordCopy, fsgn and Wave
REM into the buffer code%. The loop runs the assembler twice (OPT 0 then
REM OPT 2) so that forward references to labels are resolved on the
REM second pass. DrawHMap, DWordCopy and Wave are left as global labels
REM because the main program calls them.
REM NOTE(review): gap1% and gap2% are never referenced; presumably they
REM pad the code away from neighbouring heap data - confirm.
LOCAL P%, code%, pass%, gap1%, gap2%
LOCAL LC3, L1, L2, L3, L4, L5, L6, L7, L8
LOCAL L9, L10, L11, L12, L13, L14, L15, L16, L17
LOCAL fsgn
DIM gap1% 4095, code% 512, gap2% 4095
FOR pass%=0 TO 2 STEP 2
P%=code%
[OPT pass%
; ---------------------------------------------------------
; DrawHMap (invoked with CALL, takes no arguments)
; Walks the gridW% x gridH% doubles at hMap% row by row.
; Each height is converted to an integer with fistp, offset
; by colTabSz% DIV 2 and used as an index into colTab%.
; The 32-bit colour found there is stored in consecutive
; DWORDs of the DIB section at dibs%.
; All registers are preserved by pushad/popad.
; NOTE(review) the table index is not range-checked
; ---------------------------------------------------------
.DrawHMap
pushad
sub esp, 16
mov ebp, dibs%
mov esi, colTab%
finit
xor edx, edx ; Y loop index
.ylp
xor ecx, ecx ; X loop index
.xlp
mov ebx, edx ; copy Y
imul ebx, gridW% ; Y*gridW
add ebx, ecx ; Y*gridW + X
shl ebx, 3 ; 8*(Y*gridW + X)
add ebx, hMap% ; hMap%+8*(Y*gridW + X)
fld QWORD [ebx] ; = h
fistp DWORD [esp] ; = h%
mov edi, [esp] ; EDI = h%
add edi, colTabSz%DIV2 ; colTabSz%DIV2 + h%
mov eax, [esi + 4*edi] ; get colour
mov [ebp], eax
add ebp, 4
add ecx, 1
cmp ecx, gridW%
jl xlp
add edx, 1
cmp edx, gridH%
jl ylp
add esp, 16
popad
ret
; ---------------------------------------------------------
; DWordCopy (invoked with SYS, three stack arguments)
; Copies numDWORDs 32-bit words from srcAddr to destAddr
; with rep movsd (ascending addresses, cld). The offsets
; below allow for the 32 bytes pushed by pushad plus the
; return address. ret 12 removes the three arguments.
; ---------------------------------------------------------
.DWordCopy
; srcAddr, destAddr, numDWORDs
pushad
; ESP+36 = srcAddr
; ESP+40 = destAddr
; ESP+44 = numDWORDs
mov esi, [esp + 36]
mov edi, [esp + 40]
mov ecx, [esp + 44]
cld
rep movsd
popad
ret 12
;REM =========================================================
;REM The following assembler code was generated by the GCC
;REM (GNU C Compiler) with some modifications to make it
;REM compatible with BB4W's assembler
;REM =========================================================
; ---------------------------------------------------------
; fsgn - takes one double on the stack at [esp+4]
; Returns in eax 1 if the value is above zero, -1 if it is
; below zero, otherwise 0. Both x87 values are popped before
; returning. Nothing in this listing calls it (Wave repeats
; the same comparison inline).
; ---------------------------------------------------------
.fsgn
fld QWORD PTR [esp+4]
mov eax, 1
fldz
fxch st1
db &DB : db (&E8 + 1) ;REM fucomi st0, st1
jbe L5
fstp st0
fstp st0
ret
.L5
fxch st1
xor eax, eax
db &DF : db (&E8 + 1) ;REM fucomip st0, st1
fstp st0
seta al
neg eax
ret
; ---------------------------------------------------------
; Wave - seven stack arguments, plain ret (caller cleans up)
; After the four pushes and sub esp, 36 the arguments are at
;   esp+56 = W        esp+60 = H        esp+64 = hMap
;   esp+68 = hMap2    esp+72 = vMap
;   esp+76 = address of scale    esp+80 = address of hDamp
; Locals kept on the stack
;   esp+0  = pointer into the row below the current cell row
;   esp+4  = W-2 (x loop limit)   esp+8  = y counter (from 1)
;   esp+12 = pointer to the row above, advanced once per row
;   esp+16 = 8*W (bytes per row)  esp+20 = H-2 (y loop limit)
;   esp+24 = 16*W                 esp+28 = scratch for fild
; Register roles inside the loops
;   ebx/ecx/edx = the three rows read from hMap, eax = x
;   esi = vMap row base, edi = hMap2 row base
;   ebp = address of hDamp, st0 on entry to a row = 8.0
; For each cell with 1 <= y < H-2 and 1 <= x < W-2
;   v = vMap + scale * (sum of 8 neighbours - 8*h)
;   vMap = v
;   hMap2 = (h+v) - sign(h+v) * hDamp
; where sign is 1, -1 or 0 (the inlined fsgn comparison).
; ---------------------------------------------------------
.Wave
push ebp
push edi
push esi
push ebx
sub esp, 36
mov edi, DWORD [esp+60]
mov eax, DWORD [esp+56]
mov edx, DWORD [esp+64]
mov DWORD [esp+8], 1
mov ebp, DWORD [esp+80]
sub edi, 2
fld DWORD [LC3]
mov DWORD [esp+20], edi
lea edi, [eax*8]
mov DWORD [esp+12], edx
mov edx, eax
sub eax, 2
mov DWORD [esp+4], eax
mov eax, DWORD [esp+20]
mov esi, edi
sal edx, 4
mov DWORD [esp+16], edi
add esi, DWORD [esp+72]
add edi, DWORD [esp+68]
cmp DWORD [esp+8], eax
mov DWORD [esp+24], edx
jge near L17
; --- start of one row (outer y loop) ---
.L13
add DWORD [esp+8], 1
cmp DWORD [esp+4], 1
jle near L9
mov ebx, DWORD [esp+12]
mov eax, 1
mov edx, DWORD [esp+24]
fld st0
add edx, ebx
mov ecx, edx
mov DWORD [esp], edx
sub ecx, DWORD [esp+16]
jmp L12
; --- h+v is above zero so the sign factor is 1.0 ---
.L14
fstp st1
fld1
; --- apply damping, store the new height in hMap2, step x ---
.L10
fmul QWORD [ebp+0]
add ecx, 8
add DWORD [esp], 8
add ebx, 8
cmp eax, DWORD [esp+4]
fsubp st1, st0
fstp QWORD [edi+8*eax-8]
je L18
; --- one cell (inner x loop) ---
.L12
mov edx, DWORD [esp]
add eax, 1
fld QWORD [ecx+8]
fld QWORD [edx]
fadd QWORD [edx+8]
fadd QWORD [edx+16]
mov edx, DWORD [esp+76]
fadd QWORD [ecx]
fadd QWORD [ecx+16]
fadd QWORD [ebx]
fadd QWORD [ebx+8]
fadd QWORD [ebx+16]
fld st1
fmul st0, st3
fsubp st1, st0
fmul QWORD [edx]
fadd QWORD [esi+8*eax-8]
fst QWORD [esi+8*eax-8]
faddp st1, st0
fldz
fxch st1
db &DB : db (&E8 + 1) ;REM fucomi st0, st1
ja L14
fxch st1
xor edx, edx
db &DF : db (&E8 + 1) ;REM fucomip st0, st1
seta dl
neg edx
mov DWORD [esp+28], edx
fild DWORD [esp+28]
jmp L10
; --- end of row, drop the per-row copy of 8.0 ---
.L18
fstp st0
; --- advance the row pointers by 8*W bytes and loop on y ---
.L9
mov eax, DWORD [esp+16]
add DWORD [esp+12], eax
add edi, eax
add esi, eax
mov eax, DWORD [esp+20]
cmp DWORD [esp+8], eax
jl near L13
fstp st0
jmp L16
.L17
fstp st0
; --- common exit, x87 stack is empty again here ---
.L16
add esp, 36
pop ebx
pop esi
pop edi
pop ebp
ret
; 1090519040 = &41000000, the single-precision encoding of 8.0
.LC3 dd 1090519040
]
NEXT pass%
ENDPROC
DEFFNCreateDIBSection
REM Create a 32 bits-per-pixel DIB section, select it into BB4W's memory
REM device context (@memhdc%) in place of the previous bitmap, delete the
REM previous bitmap and return the address of the DIB's pixel data.
REM The width and height come from @vdu%!208 and @vdu%!212.
REM Calls PROCError if CreateDIBSection fails.
LOCAL A%,B%,H%,I%,O%
REM The header declares itself 44 bytes long, so reserve and zero all 44
REM bytes. Previously only 20 were reserved and 16 initialised, leaving
REM the remaining fields to whatever happened to follow in memory.
DIM B% 43
FOR I% = 0 TO 40 STEP 4 : B%!I% = 0 : NEXT
!B% = 44
B%!4 = @vdu%!208
B%!8 = @vdu%!212
REM Low word = 1 plane, high word = &20 (32) bits per pixel
B%!12 = &200001
SYS"CreateDIBSection",@memhdc%,B%,0,^A%,0,0TOH%
IF H%=0 PROCError("Create DIBSection failed")
SYS"SelectObject",@memhdc%,H%TOO%
SYS"DeleteObject",O%
CLS
=A%
DEF FNSYS_NameToAddress(f$)
REM Return the address of the API function named f$. A "call f$"
REM instruction is assembled into a small local buffer; afterwards P%
REM points just past it, so the 32-bit relative displacement sits at P%-4
REM and the absolute target is that displacement plus P%.
LOCAL P%
DIM P% LOCAL 5
[OPT 0
call f$
]
= P% + P%!-4
DEF PROCFixWndSz
REM Clear bits &40000 and &10000 of the window style (index -16) so the
REM output window cannot be resized by dragging or maximised.
LOCAL style%
SYS "GetWindowLong", @hwnd%, -16 TO style%
style% = style% AND NOT &50000
SYS "SetWindowLong", @hwnd%, -16, style%
ENDPROC
DEF PROCError(s$)
REM Re-enable screen refresh, clear the screen, show the cursor, sound
REM the bell and print s$, then poll the keyboard until INKEY(1) returns 0.
*REFRESH ON
CLS
ON
VDU 7
PRINT '" " + s$;
REPEAT
UNTIL INKEY(1)=0
ENDPROC
|
|
Logged
|
|
|
|
rtr
Guest
|
 |
Re: Letting GCC do the hard work
« Reply #1 on: Aug 2nd, 2014, 9:11pm » |
|
on Aug 2nd, 2014, 6:26pm, David Williams wrote:You should be able to copy & paste the following code into the BB4W IDE and run it. |
|
Perhaps worth mentioning that GCC's use of the fucomi and fucomip instructions means that the code requires a P6 (e.g. Pentium Pro) or later CPU, although it's pretty unlikely that anybody will still be running something older.
If this is an issue (or simply to avoid the DBs) you can encode equivalent instructions as follows:
Code: fucom st1 : push eax : fstsw ax : sahf : pop eax ;REM fucomi st0, st1
fucomp st1 : push eax : fstsw ax : sahf : pop eax ;REM fucomip st0, st1 Richard.
|
|
Logged
|
|
|
|
David Williams
Developer
member is offline

meh

Gender: 
Posts: 452
|
 |
Re: Letting GCC do the hard work
« Reply #2 on: Aug 2nd, 2014, 10:36pm » |
|
on Aug 2nd, 2014, 9:11pm, Richard Russell wrote: Code: fucom st1 : push eax : fstsw ax : sahf : pop eax ;REM fucomi st0, st1
fucomp st1 : push eax : fstsw ax : sahf : pop eax ;REM fucomip st0, st1 |
|
Thanks, noted. I can perhaps see why fucomi & fucomip are employed in the fsgn() function (returns as an integer the sign of a double), but they crop up again outside that function, in the inner loop, where no comparisons (in the source code) are performed (loop iterators excepted). GCC knows best, I suppose. It's quite brain-melting, I find, trying to understand GCC's generated assembler code.
I haven't yet substituted your equivalent encodings for fucomi & fucomip, but I'm a bit curious to see the likely time penalty (not that it matters because the code is not intended for real-time use; I'm generating frames for a YouTube video).
For completeness, for those that might be curious, I'll include my 'wet-behind-the-ears' C source below.
I think I'll be doing a fair amount of hybrid C/BB4W stuff over the next year or so, time permitting.
Thanks again.
David.
Code:/*
// A beginner's attempt at some C
//
// Not intended to be run 'standalone' (doesn't do anything)
// Intended that the resulting assembler code be exported to BB4W
*/
/* might be worth implementing fsgn() inline */
int fsgn(double n){
    /*
    // Sign of n as an int: 1 if n > 0, -1 if n < 0, otherwise 0.
    // Each comparison evaluates to 1 (true) or 0 (false), so their
    // difference gives the three possible results directly.
    */
    return (n > 0.0) - (n < 0.0);
}
void Wave(int W, int H, double *hMap, double *hMap2, double *vMap,
          double *scale, double *hDamp){
    /*
    // Advance the wave simulation by one step.
    //
    // W, H     grid width and height (in cells)
    // hMap     current heights (read only here)
    // hMap2    receives the new heights
    // vMap     per-cell 'velocity' (change in height), updated in place
    // scale    pointer to the factor applied to the height difference
    // hDamp    pointer to the damping amount pulled off each new height
    //
    // Only cells with 1 <= x < W-2 and 1 <= y < H-2 are visited; all
    // other cells of hMap2 and vMap are left untouched.
    */
    int row, col, cell;
    double ring, centre, pull, vel;

    for (row = 1; row < H-2; row++){
        /* the three rows of hMap that surround the current cell */
        const double *next = hMap + (row+1)*W;
        const double *here = hMap + row*W;
        const double *prev = hMap + (row-1)*W;

        for (col = 1; col < W-2; col++){
            cell = row*W + col;

            /* sum of the 8 heights surrounding the current cell */
            ring = next[col-1] + next[col] + next[col+1]
                 + here[col-1]             + here[col+1]
                 + prev[col-1] + prev[col] + prev[col+1];

            /* height of the current cell itself */
            centre = here[col];

            /* sum of the height differences to the 8 neighbours */
            pull = ring - 8*centre;

            /* update and store the 'velocity' */
            vel = vMap[cell] + (*scale)*pull;
            vMap[cell] = vel;

            /* new height, damped towards zero, goes into hMap2 */
            hMap2[cell] = centre+vel - fsgn(centre+vel)*(*hDamp);
        }
    }
}
/*
// Placeholder entry point so the file links as a program; the routines
// above are only compiled for their assembler output.
// Declared as int main(void) and returns 0: the implicit-int form with a
// valueless return is not valid in C99 and later.
*/
int main(void){
    return 0;
}
|
|
Logged
|
|
|
|
rtr
Guest
|
 |
Re: Letting GCC do the hard work
« Reply #3 on: Aug 3rd, 2014, 10:19am » |
|
on Aug 2nd, 2014, 10:36pm, David Williams wrote:I can perhaps see why fucomi & fucomip are employed in the fsgn() function (returns as an integer the sign of a double), but they crop up again outside that function, in the inner loop |
|
I think you're misreading the generated assembler code. The fsgn function is never called and can be deleted without affecting the operation of your program! The use of fucomi and fucomip in the 'inner loop' are where GCC has automatically inlined the fsgn code, for performance reasons.
Incidentally this is a common C implementation of the signum function:
Code:int fsgn(double val) {return (val > 0.0) - (val < 0.0);} It relies on the fact that comparisons return 1 for true and 0 for false, which is guaranteed. I don't know whether the generated assembler code will be any simpler than for your version.
Quote:I haven't yet substituted your equivalent encodings for fucomi & fucomip, but I'm a bit curious to see the likely time penalty |
|
In human-written assembler code one would try to avoid the need to save and restore eax, which makes the overhead that much greater.
There is an argument that ASMLIB should have included fucomi and the other comparison instructions that were added to the Pentium Pro, but nobody has ever commented on the omission, or asked me to correct it.
Richard.
|
|
Logged
|
|
|
|
David Williams
Developer
member is offline

meh

Gender: 
Posts: 452
|
 |
Re: Letting GCC do the hard work
« Reply #4 on: Aug 10th, 2014, 8:49pm » |
|
I'm pleased to say that, with the WavePlasma6 program, when the C source code (containing the subroutine that does the calculations - 'Wave') is compiled to a DLL, the performance really doesn't take much of a hit (both versions give around 80 to 90 fps on my laptop).
The only explicit GCC optimisation switch I'm specifying is -O2, but if anyone can suggest any others then please go ahead. (Without the -O2 switch, it runs at around 40 fps on my laptop - so nearly half the speed.)
For interested parties, the "DLL version" of WavePlasma6 is listed below, and the DLL itself (wave1d.dll) can be downloaded from here:
www.bb4wgames.com/temp/wave1d_dll.zip
Code:
REM WavePlasma6b // 10-08-2014
REM
REM Requires wave1d.dll
REM
REM Same demo as WavePlasma6, but the simulation step (Wave) is imported
REM from wave1d.dll instead of being assembled in-line.
*ESC OFF
*FLOAT 64
ON CLOSE PROC_clean_up : QUIT
ON ERROR PROC_clean_up : PROCError(REPORT$ + " at line " + STR$ERL)
HIMEM = PAGE + 10*&100000

REM Load the DLL and look up the exported routine
SYS "LoadLibrary", "wave1d" TO wave1d_dll%
IF wave1d_dll% = 0 PROCError("Can't load wave1d.dll (LoadLibrary returned 0)")
SYS "GetProcAddress", wave1d_dll%, "Wave" TO Wave%
IF Wave% = 0 PROC_clean_up : PROCError("Couldn't import the DLL function 'Wave' (GetProcAddress returned 0)")

R% = RND(-50681821) : REM Seed BB4W's PRNG
Delay% = TRUE
ScrW% = 512
ScrH% = 512
PROCFixWndSz
VDU 23,22,ScrW%;ScrH%;8,16,16,0 : OFF
dibs% = FNCreateDIBSection

REM Resolve the API entry points once so the main loop can SYS by address
GetTickCount% = FNSYS_NameToAddress("GetTickCount")
InvalidateRect% = FNSYS_NameToAddress("InvalidateRect")
Sleep% = FNSYS_NameToAddress("Sleep")

REM Three grids of 64-bit floats, each aligned to an 8-byte boundary:
REM hMap% = current heights, hMap2% = next heights, vMap% = velocities
gridW% = ScrW%
gridH% = ScrH%
gridSz% = 8*gridW%*gridH% : REM Grid size in bytes
DIM hMap% gridSz%+8 : hMap% =(hMap% +7) AND -8
DIM hMap2% gridSz%+8 : hMap2%=(hMap2%+7) AND -8
DIM vMap% gridSz%+8 : vMap% =(vMap% +7) AND -8

REM Colour table of 32-bit entries, aligned to a 4-byte boundary
colTabSz% = 10000
DIM colTab% 4*(colTabSz% + 1)
colTab%=(colTab%+3) AND -4

MaxExciters% = 8
DIM exciter{( MaxExciters%-1 ) active%, x%, y%, \
\ amp, theta, dtheta, life%, dying%}

PROCFillColourTable
PROC_asm

REM Clear the grids
FOR I%=hMap% TO hMap%+gridSz%-1 STEP 8:|I% = 0.0:NEXT
FOR I%=hMap2% TO hMap2%+gridSz%-1 STEP 8:|I% = 0.0:NEXT
FOR I%=vMap% TO vMap%+gridSz%-1 STEP 8:|I% = 0.0:NEXT

dampTheta# = 0.0
dampDTheta# = 0.0001
dampAmp# = 0.05
scale# = 0.05
hDamp# = 0.1
first% = TRUE
frame% = 0
*REFRESH OFF
SYS GetTickCount% TO time0%
REPEAT
  FOR I% = 0 TO MaxExciters%-1
    IF exciter{(I%)}.active% THEN
      REM Drive the height at the exciter's cell with a sine wave
      X% = exciter{(I%)}.x%
      Y% = exciter{(I%)}.y%
      |(hMap% + 8*(Y%*gridW% + X%)) = \
      \ 1.0# * exciter{(I%)}.amp * SIN(exciter{(I%)}.theta)
      exciter{(I%)}.theta += exciter{(I%)}.dtheta
      IF exciter{(I%)}.life% > 0 THEN
        exciter{(I%)}.life%-=1
      ELSE
        exciter{(I%)}.dying% = TRUE
      ENDIF
      REM A dying exciter fades out, then is retired and its velocity cleared
      IF exciter{(I%)}.dying% THEN
        exciter{(I%)}.amp -= 1
        IF exciter{(I%)}.amp <= 0 THEN
          exciter{(I%)}.active% = FALSE
          |(vMap% + 8*(Y%*gridW% + X%)) = 0.0
        ENDIF
      ENDIF
    ELSE
      REM Free slot: always start one exciter on the first frame,
      REM afterwards with probability 1 in 2500 per slot per frame
      IF first%=TRUE OR RND(2500)=1 THEN
        IF first% THEN first% = FALSE
        exciter{(I%)}.active% = TRUE
        REM RND(n) returns 1..n, so RND(gridW%)-2 could yield -1 and make
        REM the stores above write outside the grids. RND(gridW%-2) keeps
        REM the position in 1..gridW%-2 (likewise for Y).
        exciter{(I%)}.x% = RND(gridW%-2)
        exciter{(I%)}.y% = RND(gridH%-2)
        exciter{(I%)}.amp = 100+RND(800)
        exciter{(I%)}.theta = 0
        IF RND(10) > 1 THEN
          exciter{(I%)}.dtheta = 0.01 * RND(1)
        ELSE
          exciter{(I%)}.dtheta = 0.005 * RND(1)
        ENDIF
        exciter{(I%)}.life% = 500+RND(1000)
        exciter{(I%)}.dying% = FALSE
      ENDIF
    ENDIF
  NEXT I%

  REM Slowly vary the damping, step the simulation (DLL), then draw it
  hDamp# = 1.0#*(0.01+dampAmp#*ABSSIN(dampTheta#))
  dampTheta# += dampDTheta#
  SYS Wave%, gridW%, gridH%, hMap%, hMap2%, vMap%, ^scale#, ^hDamp#
  SYS DWordCopy, hMap2%, hMap%, 2*gridW%*gridH%
  CALL DrawHMap
  SYS InvalidateRect%, @hwnd%, 0, 0
  *REFRESH
  IF Delay% THEN SYS Sleep%, 2

  REM Show the frame rate in the title bar about once a second
  frame% += 1
  SYS GetTickCount% TO time1%
  IF time1%-time0% >= 1000 THEN
    SYS "SetWindowText", @hwnd%, STR$frame% + " fps"
    SYS GetTickCount% TO time0%
    frame% = 0
  ENDIF
UNTIL FALSE
END
DEF PROCFillColourTable
REM Fill entries 0..colTabSz%-1 of the table at colTab% with 32-bit
REM colours (&RRGGBB). Each channel is 128 + 127 * the product of two
REM sine waves whose start phases and step sizes are chosen at random.
LOCAL N%, K%, red%, green%, blue%
LOCAL ph(), dph()
DIM ph(6), dph(6)
REM Random numbers are drawn in the order phase 1, step 1, phase 2, step 2...
FOR K% = 1 TO 6
  ph(K%) = 2*PI*RND(1)
  dph(K%) = 0.1*RND(1)
NEXT K%
FOR N% = 0 TO colTabSz%-1
  red% = 128+127*SIN(ph(1))*SIN(ph(2))
  green% = 128+127*SIN(ph(3))*SIN(ph(4))
  blue% = 128+127*SIN(ph(5))*SIN(ph(6))
  colTab%!(4*N%) = red%*&10000 + green%*&100 + blue%
  REM Advance all six phases for the next table entry
  FOR K% = 1 TO 6
    ph(K%) += dph(K%)
  NEXT K%
NEXT N%
ENDPROC
DEF PROC_asm
REM Assemble the machine-code routines DrawHMap and DWordCopy into the
REM buffer code%. The loop runs the assembler twice (OPT 0 then OPT 2).
REM DrawHMap and DWordCopy are left as global labels because the main
REM program calls them; the loop labels xlp and ylp are kept local.
REM NOTE(review): gap1% and gap2% are never referenced; presumably they
REM pad the code away from neighbouring heap data - confirm.
LOCAL P%, code%, pass%, gap1%, gap2%
LOCAL xlp, ylp
DIM gap1% 4095, code% 1023, gap2% 4095
FOR pass%=0 TO 2 STEP 2
P%=code%
[OPT pass%
; ---------------------------------------------------------
; DrawHMap (invoked with CALL, takes no arguments)
; Walks the gridW% x gridH% doubles at hMap% row by row.
; Each height is converted to an integer with fistp, offset
; by colTabSz% DIV 2 and used as an index into colTab%.
; The 32-bit colour found there is stored in consecutive
; DWORDs of the DIB section at dibs%.
; All registers are preserved by pushad/popad.
; NOTE(review) the table index is not range-checked
; ---------------------------------------------------------
.DrawHMap
pushad
sub esp, 16
mov ebp, dibs%
mov esi, colTab%
finit
xor edx, edx ; Y loop index
.ylp
xor ecx, ecx ; X loop index
.xlp
mov ebx, edx ; copy Y
imul ebx, gridW% ; Y*gridW
add ebx, ecx ; Y*gridW + X
shl ebx, 3 ; 8*(Y*gridW + X)
add ebx, hMap% ; hMap%+8*(Y*gridW + X)
fld QWORD [ebx] ; = h
fistp DWORD [esp] ; = h%
mov edi, [esp] ; EDI = h%
add edi, colTabSz%DIV2 ; colTabSz%DIV2 + h%
mov eax, [esi + 4*edi] ; get colour
mov [ebp], eax
add ebp, 4
add ecx, 1
cmp ecx, gridW%
jl xlp
add edx, 1
cmp edx, gridH%
jl ylp
add esp, 16
popad
ret
; ---------------------------------------------------------
; DWordCopy (invoked with SYS, three stack arguments)
; Copies numDWORDs 32-bit words from srcAddr to destAddr
; with rep movsd (ascending addresses, cld). The offsets
; below allow for the 32 bytes pushed by pushad plus the
; return address. ret 12 removes the three arguments.
; ---------------------------------------------------------
.DWordCopy
; srcAddr, destAddr, numDWORDs
pushad
; ESP+36 = srcAddr
; ESP+40 = destAddr
; ESP+44 = numDWORDs
mov esi, [esp + 36]
mov edi, [esp + 40]
mov ecx, [esp + 44]
cld
rep movsd
popad
ret 12
]
NEXT pass%
ENDPROC
DEFFNCreateDIBSection
REM Create a 32 bits-per-pixel DIB section, select it into BB4W's memory
REM device context (@memhdc%) in place of the previous bitmap, delete the
REM previous bitmap and return the address of the DIB's pixel data.
REM The width and height come from @vdu%!208 and @vdu%!212.
REM Calls PROCError if CreateDIBSection fails.
LOCAL A%,B%,H%,I%,O%
REM The header declares itself 44 bytes long, so reserve and zero all 44
REM bytes. Previously only 20 were reserved and 16 initialised, leaving
REM the remaining fields to whatever happened to follow in memory.
DIM B% 43
FOR I% = 0 TO 40 STEP 4 : B%!I% = 0 : NEXT
!B% = 44
B%!4 = @vdu%!208
B%!8 = @vdu%!212
REM Low word = 1 plane, high word = &20 (32) bits per pixel
B%!12 = &200001
SYS"CreateDIBSection",@memhdc%,B%,0,^A%,0,0TOH%
IF H%=0 PROCError("Create DIBSection failed")
SYS"SelectObject",@memhdc%,H%TOO%
SYS"DeleteObject",O%
CLS
=A%
DEF FNSYS_NameToAddress(f$)
REM Return the address of the API function named f$. A "call f$"
REM instruction is assembled into a small local buffer; afterwards P%
REM points just past it, so the 32-bit relative displacement sits at P%-4
REM and the absolute target is that displacement plus P%.
LOCAL P%
DIM P% LOCAL 5
[OPT 0
call f$
]
= P% + P%!-4
DEF PROCFixWndSz
REM Clear bits &40000 and &10000 of the window style (index -16) so the
REM output window cannot be resized by dragging or maximised.
LOCAL style%
SYS "GetWindowLong", @hwnd%, -16 TO style%
style% = style% AND NOT &50000
SYS "SetWindowLong", @hwnd%, -16, style%
ENDPROC
DEF PROC_clean_up
REM Release wave1d.dll if it is loaded. Safe to call more than once.
REM The += 0 makes sure the variable exists even if this runs before
REM LoadLibrary has been reached.
wave1d_dll% += 0
IF wave1d_dll% <> 0 THEN
  SYS "FreeLibrary", wave1d_dll%
  REM Forget the handle so a later call (for example ON CLOSE after an
  REM error has already cleaned up) does not free the library twice
  wave1d_dll% = 0
ENDIF
ENDPROC
DEF PROCError(s$)
REM Re-enable screen refresh, clear the screen, show the cursor, sound
REM the bell and print s$, then poll the keyboard until INKEY(1) returns 0.
*REFRESH ON
CLS
ON
VDU 7
PRINT '" " + s$;
REPEAT
UNTIL INKEY(1)=0
ENDPROC
David. --
|
|
Logged
|
|
|
|
David Williams
Developer
member is offline

meh

Gender: 
Posts: 452
|
 |
Re: Letting GCC do the hard work
« Reply #5 on: Aug 11th, 2014, 9:30pm » |
|
As part of my familiarization with C (with a view to writing hybrid C and BB4W programs), I revisited an eccentric old pastime of mine: searching for circular alignments (and other geometric shapes) amongst sets of randomly positioned points. Ten years ago when I was into this stuff, it would take BBC BASIC several minutes to search 100 or so random 'targets'. Now it takes around 20 seconds on my laptop (in BASIC). Admittedly, the circle-finding algorithm is not robust, nor is it the most efficient way of looking for circles (that would probably be some variant of the Hough Transform - a linear-time search).
Anyway, the compiled C version (after conversion to BB4W assembler code) takes ~0.2 seconds to search 100 random points for circles, some 80 to 100 times faster than the BASIC version. The DLL version takes around 1 second, some 4-5 times slower than the ASM version, but still significantly faster than the BASIC version.
The following link to a Zip folder contains different versions of the circle finder (BASIC, assembly language, DLL, C source):
http://www.bb4wgames.com/temp/circfinder.zip
The assembly language version makes (minor) use of ASMLIB (because the GCC compiler emitted a CMOVxx instruction).
Parameters used (the most important ones):
Number of random (x,y) points: 100 Grid size: 20km x 16km Minimum 'hits' required per circle: 8 Error tolerance: 60 metres Minimum allowed radius: 2000 metres Maximum allowed radius: 6000 metres
Note that none of this is based on final, 'production quality' code. I am, after all, merely learning C.
I think a "GCC assembler dump to BB4W assembler code converter" would save a lot of time, so perhaps that's a new project for me at some point.
David. --
|
|
Logged
|
|
|
|
yee
New Member
member is offline


Posts: 4
|
 |
Re: Letting GCC do the hard work
« Reply #6 on: Aug 12th, 2014, 8:17pm » |
|
Hi, everyone,
Just a note to say that some years ago I came across an article, "Easy C" by Pete Orlin and John Heath, in the May 1985 issue of Byte magazine (p 137-148).
It describes their use of the C preprocessor to make their C code read a bit like BASIC code, which a beginner or infrequent C user may find easier to "understand".
(I can e-mail a copy if you can't search/find a copy )
|
|
Logged
|
|
|
|
David Williams
Developer
member is offline

meh

Gender: 
Posts: 452
|
 |
Re: Letting GCC do the hard work
« Reply #7 on: Aug 17th, 2014, 08:13am » |
|
Probably my last post for a while...
1000 depth-sorted 'vector balls' based on graphics routines written in C (including the Shell Sort code I borrowed from Rosetta Code). I get 60 fps on my laptop, which is quite impressive, considering (well, considering that I don't yet really know what I'm doing with C):
www.bb4wgames.com/temp/vector_balls.zip [EXE; 142 Kb]
Update: Two versions of glib.dll compiled using different GCC optimisation settings (-O2 and -O3 plus -ffast-math). Also edited this post to include the image link below.
Screenshot: www.bb4wgames.com/temp/vecballs.jpg
This kind of performance is convincing me that hybridizing BB4W and C/C++ code is the way to go (for me personally).
I'll include the BB4W part of the source below for curious people.
David. ---
Code:
*FLOAT 64
*ESC OFF
ON ERROR PROCerror
HIMEM = PAGE + 2*&100000
PROCFixWndSz
MODE 8
OFF

REM Load the GLIB library and attach its DLL; from here on errors and
REM window closure run PROCCleanup first
INSTALL @lib$ + "GLIB"
PROCInitGLIB( @lib$ + "glib.dll", g{} )
ON ERROR PROCCleanup : PROCerror
ON CLOSE PROCCleanup : QUIT

REM Addresses of the routines called with SYS in the main loop
GetTickCount = FN`s("GetTickCount")
LerpClr = FNImport("LerpClr")
Plot = FNImport("Plot")
InitExPoints = FNImport("InitExPointList")
ShellSort = FNImport("ShellSortExPointListZValues")
Rotate = FNImport("RotateExPoints")
MakeBallBitmap = FNImport("MakeBallBitmap1")

REM Create a 48x48 ball bitmap (buffer aligned to a 4-byte boundary):
DIM ball% 4*(48*48 + 1)
ball% = (ball% + 3) AND -4
SYS MakeBallBitmap, ball%, 48, &40AA20, &FF0020, 0.99*&10000, &10000

REM One structure per ball: original and transformed coordinates, a sort
REM key and three spare integers
N% = 1000
DIM list{(N%-1) x#, y#, z#, x2#, y2#, z2#, key%, d0%, d1%, d2% }
listBaseAddr% = ^list{(0)}.x#
IF (listBaseAddr% AND 3) <> 0 THEN
  PRINT '" The coordinates list base address is not DWORD-aligned!"
  PRINT '" This may affect performance. Continuing in 5 seconds..."
  WAIT 500
ENDIF

REM Define objects (balls) 3D (x,y,z) coordinates:
FOR I% = 0 TO N%-1
  list{(I%)}.x# = (RND(1)-0.5) * 800.0
  list{(I%)}.y# = (RND(1)-0.5) * 800.0
  list{(I%)}.z# = (RND(1)-0.5) * 800.0
  list{(I%)}.x2# = 0.0
  list{(I%)}.y2# = 0.0
  list{(I%)}.z2# = 0.0
NEXT

REM Initialise rotation angles:
t1 = 2*PI*RND(1)
t2 = 2*PI*RND(1)
t3 = 2*PI*RND(1)

F% = &10000 : REM Frequently used constant
frames% = 0
*REFRESH OFF
SYS GetTickCount TO time0%
REPEAT
  REM Draw background:
  SYS LerpClr, g{}, &102030, &805090

  REM Init key values for Z-sort
  REM (this could and perhaps should be moved into the rotation routine):
  SYS InitExPoints, listBaseAddr%, N%

  REM Rotate coordinates:
  SYS Rotate, listBaseAddr%, 1+2+4, N%, F%*t1, F%*t2, F%*t3, \
  \ F%*-100, F%*-200, F%*50, \
  \ F%*320, F%*256, F%*0, \
  \ 1, F%*300, F%*800

  REM Shell sort code (in C) courtesy of Rosetta Code, many thanks:
  SYS ShellSort, listBaseAddr%, N%, -1

  REM Draw the depth-sorted balls (key% holds the index of the ball to
  REM plot at each position in the sorted order):
  FOR I% = 0 TO N%-1
    J% = list{(I%)}.key%
    SYS Plot, g{}, ball%, 48, 48, list{(J%)}.x2#-16, list{(J%)}.y2#-16
  NEXT I%
  PROCDisplay( TRUE )

  REM Show the frame rate in the title bar about once a second
  frames% += 1
  SYS GetTickCount TO time1%
  IF time1%-time0%>=1000 THEN
    SYS GetTickCount TO time0%
    SYS "SetWindowText", @hwnd%, STR$frames% + " fps"
    frames% = 0
  ENDIF

  REM Bump rotation angles:
  t1 += 0.02001
  t2 += 0.01905
  t3 += 0.00598
UNTIL FALSE
DEF PROCFixWndSz
REM Clear bits &40000 and &10000 of the window style (index -16) so the
REM output window cannot be resized by dragging or maximised.
LOCAL style%
SYS "GetWindowLong", @hwnd%, -16 TO style%
style% = style% AND NOT &50000
SYS "SetWindowLong", @hwnd%, -16, style%
ENDPROC
DEF PROCerror
REM Re-enable screen refresh, clear the screen, show the cursor and print
REM the last error message with its line number, then poll the keyboard
REM until INKEY(1) returns 0.
*REFRESH ON
CLS
ON
PRINT '" ";
REPORT
PRINT " at line "; ERL;
REPEAT
UNTIL INKEY(1)=0
ENDPROC
|
|
|
|
rtr
Guest
|
 |
Re: Letting GCC do the hard work
« Reply #8 on: Aug 17th, 2014, 10:32am » |
|
on Aug 17th, 2014, 08:13am, David Williams wrote:This kind of performance is convincing me that hybridizing BB4W and C/C++ code is the way to go (for me personally). |
|
Performance (if by that you mean speed) is not a good reason to go down the GCC route. Even though the code generators in modern compilers are very good, you will almost always be able to do better with hand-crafted assembler.
That is especially true when you are not targeting a particular CPU architecture, but want the code to run on a wide range of machines. In that case much of the clever code-optimising for a specific architecture, that GCC can do very well, will benefit some machines at the expense of others.
If you are compiling with the -march=native switch and then testing your code on the same machine you are getting a misleading impression of performance (unless of course you want to go down the route of including machine code for a range of different architectures and choosing the best one at run time).
Where using C does admittedly have advantages is in speed and ease of coding, and especially in time taken debugging. If those are the issues that most concern you, then fine.
Richard.
|
|
Logged
|
|
|
|
David Williams
Developer
member is offline

meh

Gender: 
Posts: 452
|
 |
Re: Letting GCC do the hard work
« Reply #9 on: Aug 17th, 2014, 11:38am » |
|
on Aug 17th, 2014, 10:32am, Richard Russell wrote:If you are compiling with the -march=native switch and then testing your code on the same machine you are getting a misleading impression of performance [...] |
|
Yes, I did use that switch, and after uploading the EXE for public consumption I discovered that it crashed my 32-bit XP-based laptop (which I hardly use now!). I suspect my use of -march=native caused GCC to generate 64-bit code since the laptop the code was compiled on is a 64-bit machine. Lesson learned.
Re-compiling the vector balls demo without the aforementioned switch results in the code working on the 32-bit laptop, although the frame rate isn't as high as on the compilation machine (which is a little faster anyway, I think).
Quote:Where using C does admittedly have advantages is in speed and ease of coding, and especially in time taken debugging. If those are the issues that most concern you, then fine. |
|
For the vast majority of my 'applications', the speed of GCC's generated ASM code suffices (and in some cases, has exceeded the speed of my hand-written ASM code, which isn't too surprising!). I won't be touching -- or rather, writing -- assembler code again unless my life depends on it.
David. --
|
|
Logged
|
|
|
|
|