INSTALL @lib$+"ASMLIB2"
DIM crossterms 15+15, result 15+15, gap% 2047, P% 120
crossterms = (crossterms + 15) AND -16
result = (result + 15) AND -16 : REM align
ON ERROR LOCAL [OPT FN_asmext : ]
[
.mul
movq xmm0, [^x%%] ; xmm0 = { 0, 0, A, B } = { *, *, A, B }
movq xmm1, [^y%%] ; xmm1 = { 0, 0, C, D } = { *, *, C, D }
punpckldq xmm0, xmm0 ; xmm0 = { A, A, B, B } = { *, A, *, B }
punpckldq xmm1, xmm1 ; xmm1 = { C, C, D, D } = { *, C, *, D }
pshufd xmm2, xmm1, %10001101 ; xmm2 = { D, D, C, C } = { *, D, *, C }
pmuludq xmm1, xmm0 ; xmm1 = { AC, BD } // provisional result
pmuludq xmm2, xmm0 ; xmm2 = { AD, BC } // cross-terms
movdqa [result], xmm1
movdqa [crossterms], xmm2
mov eax, crossterms[0]
mov edx, crossterms[4] ; edx|eax = BC
add result[4], eax
adc result[8], edx
adc dword result[12], 0 ; add the first cross-term
mov eax, crossterms[8]
mov edx, crossterms[12] ; edx|eax = AD
add result[4], eax
adc result[8], edx
adc dword result[12], 0 ; add the second cross-term
ret
]
RESTORE ERROR
*hex64
x%% = &1234567812345678
y%% = &8765432187654321
*hex32
CALL mul
PRINT ~result!12, ~result!8, ~result!4, ~result!0