bug in SSE math

revelator · February 10, 2015

After some discussion I went over the SSE math in idlib, and to my horror I uncovered a bug which might be the cause of some of the problems we have had with floating point precision.

// operate on two float arrays
//
// Expands to MSVC inline assembly that applies the packed-single instruction
// ALUOP##ps (ALUOP is the mnemonic stem, pasted in front of "ps") to the data
// addressed through SRC0 and SRC1 and stores the result through DST.
//
// Shape of the expansion:
//   - Declares the locals `pre` and `post` and passes them to KFLOATINITDSS.
//     Because of that declaration, and the fixed labels lpA / lpNA / done, the
//     macro cannot be expanded twice in the same scope.
//   - KFLOATINITDSS is defined elsewhere. Presumably it leaves edx/esi/edi as
//     the SRC0/SRC1/DST bases, ebx as a negative byte offset and eax as a
//     value whose low bits reflect source alignment -- TODO confirm.
//   - `and eax,15`: a non-zero result branches to lpNA, which loads both
//     sources with movups; a zero result goes to lpA, which uses movaps and
//     memory-operand ALUOP##ps (both require 16-byte-aligned addresses).
//   - Each loop pass handles 32 bytes (two XMM registers), prefetches 64 bytes
//     past the current offset on both sources, and repeats while
//     `add ebx,16*2` still leaves ebx negative (jl).
//   - Both loops store with movaps, so edi+ebx has to be 16-byte aligned on
//     either path.
//   - After `done`, edx/esi/edi are reloaded from the macro arguments and
//     KFLOATOPER is invoked with KALUDSS1 / KALUDSS4 and COUNT; presumably
//     this handles the elements the main loops do not cover -- TODO confirm.
//
// NOTE(review): both of the last two lines start with __asm. The second one is
// inside KFLOATOPER's still-open argument list, so it becomes part of the
// second macro argument instead of starting a new statement. If KFLOATOPER or
// KALUDSS4 expand to text that already begins with __asm, this yields
// `__asm __asm` -- confirm against their definitions.
//
// Comments are kept above the #define on purpose: a // comment placed on a
// line that ends in a backslash would absorb the continued lines after it.
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT )		\
	int	pre,post;										\
	KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post )	\
	__asm		and		eax,15							\
	__asm		jne		lpNA							\
	__asm		jmp		lpA								\
	__asm		align	16								\
	__asm	lpA:										\
	__asm		movaps	xmm1,[edx+ebx]					\
	__asm		movaps	xmm2,[edx+ebx+16]				\
	__asm		ALUOP##ps	xmm1,[esi+ebx]				\
	__asm		ALUOP##ps	xmm2,[esi+ebx+16]			\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		prefetchnta	[esi+ebx+64]				\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpA								\
	__asm		jmp		done							\
	__asm		align	16								\
	__asm	lpNA:										\
	__asm		movups	xmm1,[edx+ebx]					\
	__asm		movups	xmm2,[edx+ebx+16]				\
	__asm		movups	xmm3,[esi+ebx]					\
	__asm		movups	xmm4,[esi+ebx+16]				\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		prefetchnta	[esi+ebx+64]				\
	__asm		ALUOP##ps	xmm1,xmm3					\
	__asm		ALUOP##ps	xmm2,xmm4					\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpNA							\
	__asm	done:										\
	__asm		mov		edx,SRC0						\
	__asm		mov		esi,SRC1						\
	__asm		mov		edi,DST							\
	__asm		KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ),		\
	__asm		KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )

the above is the fixed function.

heres what it looked like earlier

// operate on two float arrays
//
// Expands to MSVC inline assembly that applies the packed-single instruction
// ALUOP##ps (ALUOP is the mnemonic stem, pasted in front of "ps") to the data
// addressed through SRC0 and SRC1 and stores the result through DST.
//
// Shape of the expansion:
//   - Declares the locals `pre` and `post` and passes them to KFLOATINITDSS.
//     Because of that declaration, and the fixed labels lpA / lpNA / done, the
//     macro cannot be expanded twice in the same scope.
//   - KFLOATINITDSS is defined elsewhere. Presumably it leaves edx/esi/edi as
//     the SRC0/SRC1/DST bases, ebx as a negative byte offset and eax as a
//     value whose low bits reflect source alignment -- TODO confirm.
//   - `and eax,15`: a non-zero result branches to lpNA, which loads both
//     sources with movups; a zero result goes to lpA, which uses movaps and
//     memory-operand ALUOP##ps (both require 16-byte-aligned addresses).
//   - Each loop pass handles 32 bytes (two XMM registers), prefetches 64 bytes
//     past the current offset on both sources, and repeats while
//     `add ebx,16*2` still leaves ebx negative (jl).
//   - Both loops store with movaps, so edi+ebx has to be 16-byte aligned on
//     either path.
//   - After `done`, edx/esi/edi are reloaded from the macro arguments and
//     KFLOATOPER is invoked with KALUDSS1 / KALUDSS4 and COUNT; presumably
//     this handles the elements the main loops do not cover -- TODO confirm.
//
// NOTE(review): in this version KFLOATOPER is invoked without a leading
// __asm. That is only valid if KFLOATOPER's own expansion supplies the __asm
// keywords it needs -- confirm against its definition.
//
// Comments are kept above the #define on purpose: a // comment placed on a
// line that ends in a backslash would absorb the continued lines after it.
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT )		\
	int	pre,post;										\
	KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post )	\
	__asm		and		eax,15							\
	__asm		jne		lpNA							\
	__asm		jmp		lpA								\
	__asm		align	16								\
	__asm	lpA:										\
	__asm		movaps	xmm1,[edx+ebx]					\
	__asm		movaps	xmm2,[edx+ebx+16]				\
	__asm		ALUOP##ps	xmm1,[esi+ebx]				\
	__asm		ALUOP##ps	xmm2,[esi+ebx+16]			\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		prefetchnta	[esi+ebx+64]				\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpA								\
	__asm		jmp		done							\
	__asm		align	16								\
	__asm	lpNA:										\
	__asm		movups	xmm1,[edx+ebx]					\
	__asm		movups	xmm2,[edx+ebx+16]				\
	__asm		movups	xmm3,[esi+ebx]					\
	__asm		movups	xmm4,[esi+ebx+16]				\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		prefetchnta	[esi+ebx+64]				\
	__asm		ALUOP##ps	xmm1,xmm3					\
	__asm		ALUOP##ps	xmm2,xmm4					\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpNA							\
	__asm	done:										\
	__asm		mov		edx,SRC0						\
	__asm		mov		esi,SRC1						\
	__asm		mov		edi,DST							\
			KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ),		\
			KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )

notice the lack of the __asm keyword in the macro calls.

theres a similar function above this macro

// operate on a constant and a float array
//
// Expands to MSVC inline assembly that applies the packed-single instruction
// ALUOP##ps (ALUOP is the mnemonic stem, pasted in front of "ps") with
// CONSTANT as the first operand and the data addressed through SRC as the
// second, storing the result through DST.
//
// Shape of the expansion:
//   - Declares the locals `pre` and `post`. Because of that declaration, and
//     the fixed labels lpA / lpNA / done, the macro cannot be expanded twice
//     in the same scope.
//   - movss + `shufps xmm0,xmm0,0` replicate CONSTANT into all four lanes of
//     xmm0. Each pass copies xmm0 into xmm1/xmm2 before the operation, so the
//     constant is the destination (left-hand) operand of ALUOP##ps.
//   - KFLOATINITDS is defined elsewhere. Presumably it leaves edx/edi as the
//     SRC/DST bases, ebx as a negative byte offset and eax as a value whose
//     low bits reflect source alignment -- TODO confirm.
//   - `and eax,15`: a non-zero result branches to lpNA, which loads the source
//     with movups; a zero result goes to lpA, which uses memory-operand
//     ALUOP##ps (requires 16-byte-aligned addresses).
//   - Each loop pass handles 32 bytes (two XMM registers), prefetches 64 bytes
//     past the current source offset, and repeats while `add ebx,16*2` still
//     leaves ebx negative (jl).
//   - Both loops store with movaps, so edi+ebx has to be 16-byte aligned on
//     either path.
//   - After `done`, edx/edi are reloaded from the macro arguments and
//     KFLOATOPER is invoked with KALUDSS1 / KALUDSS4 and COUNT; presumably
//     this handles the elements the main loops do not cover -- TODO confirm.
//
// NOTE(review): both of the last two lines start with __asm. The second one is
// inside KFLOATOPER's still-open argument list, so it becomes part of the
// second macro argument instead of starting a new statement. If KFLOATOPER or
// KALUDSS4 expand to text that already begins with __asm, this yields
// `__asm __asm` -- confirm against their definitions.
//
// Comments are kept above the #define on purpose: a // comment placed on a
// line that ends in a backslash would absorb the continued lines after it.
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT )	\
	int	pre,post;										\
	__asm		movss	xmm0,CONSTANT					\
	__asm		shufps	xmm0,xmm0,0						\
	KFLOATINITDS( DST, SRC, COUNT, pre, post )			\
	__asm		and		eax,15							\
	__asm		jne		lpNA							\
	__asm		jmp		lpA								\
	__asm		align	16								\
	__asm	lpA:										\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		movaps	xmm1,xmm0						\
	__asm		movaps	xmm2,xmm0						\
	__asm		ALUOP##ps	xmm1,[edx+ebx]				\
	__asm		ALUOP##ps	xmm2,[edx+ebx+16]			\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpA								\
	__asm		jmp		done							\
	__asm		align	16								\
	__asm	lpNA:										\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		movaps	xmm1,xmm0						\
	__asm		movaps	xmm2,xmm0						\
	__asm		movups	xmm3,[edx+ebx]					\
	__asm		movups	xmm4,[edx+ebx+16]				\
	__asm		ALUOP##ps	xmm1,xmm3					\
	__asm		ALUOP##ps	xmm2,xmm4					\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpNA							\
	__asm	done:										\
	__asm		mov		edx,SRC							\
	__asm		mov		edi,DST							\
	__asm		KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ),	\
	__asm					KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )

notice the KFLOATOPER macro... whoops

Obsttorte · February 10, 2015

Shut up and take my like

I'm not familiar with this kind of coding, so it's hard to understand what the point is, but if it helps, you're welcome. *thumbs up*

SteveL · February 10, 2015

I don't understand... either the code or the problem! Wouldn't syntax errors cause the compiler to choke rather than produce bad results at runtime?

// operate on two float arrays
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT )		\
...
	__asm		KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ),		\
	__asm		KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )

the above is the fixed function.

heres what it looked like earlier

// operate on two float arrays
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT )		\
...
			KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ),		\
			KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )

notice the lack of the __asm keyword in the macro calls.

The KFLOATOPER macro begins with an __asm, so it probably doesn't need one preceding it where it's used.

Also note the line that begins KALUDSS4 is the second parameter to the previous macro call -- we're still inside its open bracket -- it's not a new instruction.

theres a similar function above this macro

// operate on a constant and a float array
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT )	\
...					\
	__asm		KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ),	\
	__asm					KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )

notice the KFLOATOPER macro... whoops

Is the "whoops" the same point I mentioned above, that the KALUDSS4 is the second parameter not a new instruction, or was it something different? I don't understand why this code compiles, I admit. I've never worked on embedded assembly. My first thought was that a double __asm __asm must be legal code, but the microsoft documentation says not.

___

The floating point precision problem that we found is in traditional C++ code, because of a big change in the way VS2012+ handles intermediate float results. It truncates them to 32 bits instead of using 80-bit intermediates. ASM code shouldn't be affected by that problem because its precision is explicit, so the compiler doesn't get to choose something different. Here's an example that demonstrates the problem, using real numbers that were causing a problem in TDM when calculating the part of a VP that the player can see through:

I found the visportal problem.

My result suggests that the 2010 compiler is using higher than 32-bit precision for the intermediate results of floating-point calculations, while the 2013 build truncates those results to normal floats. And the engine relies on the higher precision in some places.

Demo code:
// Inputs: two pairs of nearly equal single-precision values.
float a = -272.514099f;
float b = -272.514160f;
float c = -93.7499619f;
float d = -93.7499771f;

// Each product is first stored in its own float variable...
float e = d * a; // Result: 25548.1914
float f = b * c; // Result: 25548.1914

// ...and the stored floats are then subtracted.
float g = e - f; // Result: 0.0f
// The same arithmetic as a single expression, with no float variable holding
// the intermediate products; the result differs between the two compilers.
float h = d * a - b * c; // Result in 2013: 0. Result in 2010: -0.00156380842
That's a real life example from my InnBiz test map. The sum comes from idVec3::Cross(), which is used to calculate the normal for a new plane, and the 'h' in my example is the y-component of the new plane normal. By the time you've normalized the resulting vector and fit the plane through the view origin, that small difference has lowered the visportal scissor rectangle by 15 doom units. The other components had slightly bigger errors too, but this was the best illustration because the outcome was non-0 vs 0.

The precision problem only messes things up where you cross two near-parallel vectors. But that happens, of course, including in this situation where the player looks through 2 visportals whose corners appear to be nearly touching given the player's viewpoint.

revelator · February 10, 2015

Aye, it's a bit hard to wrap your head around. The reason I found out that it was wrong was that I reformatted my source, and what was previously a single line of code ended up escaped across two lines.

This macro KFLOATOPER funny enough it compiled but i noticed a ton of warnings about non standard syntax suddenly from microsofts intellisense, so i had a look at the function and noticed that compared to the above macro the end bit was missing all the __asm keywords and the macro before that which had a similar end had them in two places.

I think what happened was that the dev intended one __asm call on the end function in those macros but put the second one in the wrong spot in the macro above, which already had an __asm keyword defined.

So it should actually look like this i think.

// operate on a constant and a float array
//
// Expands to MSVC inline assembly that applies the packed-single instruction
// ALUOP##ps (ALUOP is the mnemonic stem, pasted in front of "ps") with
// CONSTANT as the first operand and the data addressed through SRC as the
// second, storing the result through DST.
//
// Shape of the expansion:
//   - Declares the locals `pre` and `post`. Because of that declaration, and
//     the fixed labels lpA / lpNA / done, the macro cannot be expanded twice
//     in the same scope.
//   - movss + `shufps xmm0,xmm0,0` replicate CONSTANT into all four lanes of
//     xmm0. Each pass copies xmm0 into xmm1/xmm2 before the operation, so the
//     constant is the destination (left-hand) operand of ALUOP##ps.
//   - KFLOATINITDS is defined elsewhere. Presumably it leaves edx/edi as the
//     SRC/DST bases, ebx as a negative byte offset and eax as a value whose
//     low bits reflect source alignment -- TODO confirm.
//   - `and eax,15`: a non-zero result branches to lpNA, which loads the source
//     with movups; a zero result goes to lpA, which uses memory-operand
//     ALUOP##ps (requires 16-byte-aligned addresses).
//   - Each loop pass handles 32 bytes (two XMM registers), prefetches 64 bytes
//     past the current source offset, and repeats while `add ebx,16*2` still
//     leaves ebx negative (jl).
//   - Both loops store with movaps, so edi+ebx has to be 16-byte aligned on
//     either path.
//   - After `done`, edx/edi are reloaded from the macro arguments and
//     KFLOATOPER is invoked with KALUDSS1 / KALUDSS4 and COUNT; presumably
//     this handles the elements the main loops do not cover -- TODO confirm.
//
// NOTE(review): KFLOATOPER is preceded by a single __asm here. If
// KFLOATOPER's expansion already begins with __asm, this yields
// `__asm __asm` -- confirm against its definition.
//
// Comments are kept above the #define on purpose: a // comment placed on a
// line that ends in a backslash would absorb the continued lines after it.
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT )	\
	int	pre,post;										\
	__asm		movss	xmm0,CONSTANT					\
	__asm		shufps	xmm0,xmm0,0						\
	KFLOATINITDS( DST, SRC, COUNT, pre, post )			\
	__asm		and		eax,15							\
	__asm		jne		lpNA							\
	__asm		jmp		lpA								\
	__asm		align	16								\
	__asm	lpA:										\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		movaps	xmm1,xmm0						\
	__asm		movaps	xmm2,xmm0						\
	__asm		ALUOP##ps	xmm1,[edx+ebx]				\
	__asm		ALUOP##ps	xmm2,[edx+ebx+16]			\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpA								\
	__asm		jmp		done							\
	__asm		align	16								\
	__asm	lpNA:										\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		movaps	xmm1,xmm0						\
	__asm		movaps	xmm2,xmm0						\
	__asm		movups	xmm3,[edx+ebx]					\
	__asm		movups	xmm4,[edx+ebx+16]				\
	__asm		ALUOP##ps	xmm1,xmm3					\
	__asm		ALUOP##ps	xmm2,xmm4					\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpNA							\
	__asm	done:										\
	__asm		mov		edx,SRC							\
	__asm		mov		edi,DST							\
	__asm		KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ),	   \
							KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )

// operate on two float arrays
//
// Expands to MSVC inline assembly that applies the packed-single instruction
// ALUOP##ps (ALUOP is the mnemonic stem, pasted in front of "ps") to the data
// addressed through SRC0 and SRC1 and stores the result through DST.
//
// Shape of the expansion:
//   - Declares the locals `pre` and `post` and passes them to KFLOATINITDSS.
//     Because of that declaration, and the fixed labels lpA / lpNA / done, the
//     macro cannot be expanded twice in the same scope.
//   - KFLOATINITDSS is defined elsewhere. Presumably it leaves edx/esi/edi as
//     the SRC0/SRC1/DST bases, ebx as a negative byte offset and eax as a
//     value whose low bits reflect source alignment -- TODO confirm.
//   - `and eax,15`: a non-zero result branches to lpNA, which loads both
//     sources with movups; a zero result goes to lpA, which uses movaps and
//     memory-operand ALUOP##ps (both require 16-byte-aligned addresses).
//   - Each loop pass handles 32 bytes (two XMM registers), prefetches 64 bytes
//     past the current offset on both sources, and repeats while
//     `add ebx,16*2` still leaves ebx negative (jl).
//   - Both loops store with movaps, so edi+ebx has to be 16-byte aligned on
//     either path.
//   - After `done`, edx/esi/edi are reloaded from the macro arguments and
//     KFLOATOPER is invoked with KALUDSS1 / KALUDSS4 and COUNT; presumably
//     this handles the elements the main loops do not cover -- TODO confirm.
//
// NOTE(review): KFLOATOPER is preceded by a single __asm here. If
// KFLOATOPER's expansion already begins with __asm, this yields
// `__asm __asm` -- confirm against its definition.
//
// Comments are kept above the #define on purpose: a // comment placed on a
// line that ends in a backslash would absorb the continued lines after it.
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT )		\
	int	pre,post;										\
	KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post )	\
	__asm		and		eax,15							\
	__asm		jne		lpNA							\
	__asm		jmp		lpA								\
	__asm		align	16								\
	__asm	lpA:										\
	__asm		movaps	xmm1,[edx+ebx]					\
	__asm		movaps	xmm2,[edx+ebx+16]				\
	__asm		ALUOP##ps	xmm1,[esi+ebx]				\
	__asm		ALUOP##ps	xmm2,[esi+ebx+16]			\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		prefetchnta	[esi+ebx+64]				\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpA								\
	__asm		jmp		done							\
	__asm		align	16								\
	__asm	lpNA:										\
	__asm		movups	xmm1,[edx+ebx]					\
	__asm		movups	xmm2,[edx+ebx+16]				\
	__asm		movups	xmm3,[esi+ebx]					\
	__asm		movups	xmm4,[esi+ebx+16]				\
	__asm		prefetchnta	[edx+ebx+64]				\
	__asm		prefetchnta	[esi+ebx+64]				\
	__asm		ALUOP##ps	xmm1,xmm3					\
	__asm		ALUOP##ps	xmm2,xmm4					\
	__asm		movaps	[edi+ebx],xmm1					\
	__asm		movaps	[edi+ebx+16],xmm2				\
	__asm		add		ebx,16*2						\
	__asm		jl		lpNA							\
	__asm	done:										\
	__asm		mov		edx,SRC0						\
	__asm		mov		esi,SRC1						\
	__asm		mov		edi,DST							\
	__asm		KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ),		\
							KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )

SteveL · February 10, 2015

Agreed, only one of those two functions can have been right. I suspect the correct one was the KFLOAT_AA function, with no __asm at all for the KFLOATOPER, because KFLOATOPER has __asm as its first token, so after the preprocessor has done its job, the KFLOAT_CA function will end up with __asm __asm by the time the compiler gets to see the code.

I'm hoping to tap you for advice later on implementing GLEW. Now that 2.03 is out the door I plan to go and find the relevant commits in your git repo. We want to use some OpenGL3 features in TDM 2.04, so I'm wondering whether an externally managed library could save us a lot of trouble. I'll do my homework finding your code then ask in your branch thread!

revelator · February 10, 2015

Should save you a lot of work GLEW supports upto OpenGL 4.2 so that should do nicely.

And your welcome to contact me for help if something seems unclear.

As for the SSE bug, I actually found that you can remove both of the __asm keywords before the macros and it still works; at least I haven't seen anything odd yet.

I'm a bit surprised about this, as you might guess, but removing those two doesn't seem to trigger IntelliSense either, and it builds without warnings. Hmm?!

revelator · February 10, 2015

Also im a bit stumped why id's devs created those macros with an asm call for every single line when you can just do __asm { tons of assembler keywords, another ton of assembler keywords, etc }

and get away with a single __asm line Oo

SteveL · February 10, 2015

Also im a bit stumped why id's devs created those macros with an asm call for every single line when you can just do __asm { tons of assembler keywords, another ton of assembler keywords, etc }
and get away with a single __asm line Oo

I think I can answer that one... you can't capture multiple lines in a { block } in a #define. The #define reads the above macros as a single line of code because they all end with \, and the __asm keywords are acting as statement separators without triggering the end of the #define.

It's apparently illegal syntax to have two __asm in a row with nothing in between so I guess that's where the intellisense errors came from: an __asm before the macro name in KFLOAT_CA, and another provided by the macro itself as its first token.

Should save you a lot of work GLEW supports upto OpenGL 4.2 so that should do nicely.
And your welcome to contact me for help if something seems unclear.

I was hoping you'd say that. We want to try out techniques needing frame buffer objects, uniform buffer objects, and instancing next. All OGL3. I have dozens of questions but I'll go take a look what you did with GLEW this evening, hopefully to gather answers to most of them and shorten the list a lot!

revelator · February 10, 2015

GLEW is pretty easy to add as a replacement for the hardcoded api calls in Vanilla theres one 'but' though the old gllog functions need to go as they are incompatible with GLEW.

They should go anyway, since they never worked to begin with, and they are a leftover from idtech3, which used OpenGL 1.1 functions a lot more than Vanilla does.

Besides that you need to rearrange a few calls in win_glimp.cpp (two to be exact i think), else its easy going .

And thanks for the heads-up about the shortcoming of macros; that one I did not know. It makes one wonder if a function would have been more appropriate.

SteveL · February 10, 2015

Great, thanks, time for me to get reading code

theres one 'but' though the old gllog functions need to go as they are incompatible with GLEW.
They should go anyway, since they never worked to begin with, and they are a leftover from idtech3, which used OpenGL 1.1 functions a lot more than Vanilla does.

Ah, that explains why TDM crashes to desktop immediately if you try to activate them using r_logfile!

Moonbo · February 10, 2015

Reading this thread is like stumbling into a foreign language forum

revelator · February 11, 2015

Its not that bad but yeah for a first timer it might sound like gibberish once you get cracking with some code it all starts to come together pretty fast though

The Math is the bitch here that takes years to learn and im still not done besides being on the scene for around 20 years hehe.

Why, oh why, do I suck so much at algebra? I'm rather good at the rest of the math, but I could never wrap my head around algebra.

February 11, 2015

Its not that bad but yeah for a first timer it might sound like gibberish once you get cracking with some code it all starts to come together pretty fast though
The Math is the bitch here that takes years to learn and im still not done besides being on the scene for around 20 years hehe.
Why, oh why, do I suck so much at algebra? I'm rather good at the rest of the math, but I could never wrap my head around algebra.

If you have any specific questions, I'm right here.

I've got a BA in mathematics and currently am working on my MA in mathematics/compsci.

Obsttorte · February 11, 2015

Me, too. I'm Diplom Mathematician (thats the german equivalent of a Master). So if you have mathematical questions, just ask and I'd be happy to help.

revelator · February 11, 2015

Heh, thanks peeps. I hope I'm not getting too old to learn things like that; I'm nearing my fifties, agh.

If you need a hand with electronics, I majored in it, although it's been years since I last used my skills.

Im also one of the last graduates in denmark who can repair and build tube amplifiers

Guess my teacher sucked as much at algebra as i ended up doing

But OK, my school years were back in the days when more advanced math had only just surfaced in normal school here in Denmark, so I guess

he had just as much trouble explaining it as i had understanding it.

gnartsch · February 11, 2015

Reading this thread is like stumbling into a foreign language forum

Same here when it comes to funcstatics, worldspawn and any type of DromED-brushes and whatnot.

revelator · February 11, 2015

Foreign try danish -> ja vores sprog er verdens sværeste

Tels · February 14, 2015

Would also be cool if the entire SSE stuff could be implemented in Linux. Currently it just falls back to normal code there. Back then I fixed a few of the detections (so Linux at least knows what properties the current CPU has), but it's not used much...

bug in SSE math

Recommended Posts

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Link to comment

Share on other sites

Join the conversation

Recent Status Updates