Fix simd & Vector oopsies + lots of tests

This commit is contained in:
Charlie 2024-07-03 00:41:52 +02:00
parent e52e1a403e
commit aceadf4aca
4 changed files with 401 additions and 446 deletions

View file

@ -32,6 +32,6 @@ typedef struct Context_Extra {
// //
// Comment & Uncomment to swap projects // Comment & Uncomment to swap projects
// #include "oogabooga/examples/renderer_stress_test.c" #include "oogabooga/examples/renderer_stress_test.c"
// #include "oogabooga/examples/minimal_game_loop.c" // #include "oogabooga/examples/minimal_game_loop.c"
#include "entry_randygame.c" // #include "entry_randygame.c"

View file

@ -88,6 +88,8 @@ Cpu_Capabilities query_cpu_capabilities() {
return result; return result;
} }
void* memcpy(void* dest, const void* source, size_t size);
void (*simd_add_float32_64 )(float32 *a, float32 *b, float32* result) = 0; void (*simd_add_float32_64 )(float32 *a, float32 *b, float32* result) = 0;
void (*simd_add_float32_128)(float32 *a, float32 *b, float32* result) = 0; void (*simd_add_float32_128)(float32 *a, float32 *b, float32* result) = 0;
void (*simd_add_float32_256)(float32 *a, float32 *b, float32* result) = 0; void (*simd_add_float32_256)(float32 *a, float32 *b, float32* result) = 0;
@ -347,33 +349,23 @@ inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result) {
// SSE 1 float32 // SSE 1 float32
inline void sse1_add_float32_64(float32 *a, float32 *b, float32* result) { inline void sse1_add_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addss %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse1_add_float32_128(float32 *a, float32 *b, float32* result) { inline void sse1_add_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addps %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -381,33 +373,23 @@ inline void sse1_add_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse1_sub_float32_64(float32 *a, float32 *b, float32* result) { inline void sse1_sub_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subss %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse1_sub_float32_128(float32 *a, float32 *b, float32* result) { inline void sse1_sub_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subps %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -415,33 +397,23 @@ inline void sse1_sub_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse1_mul_float32_64(float32 *a, float32 *b, float32* result) { inline void sse1_mul_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulss %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse1_mul_float32_128(float32 *a, float32 *b, float32* result) { inline void sse1_mul_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -449,33 +421,23 @@ inline void sse1_mul_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse1_div_float32_64(float32 *a, float32 *b, float32* result) { inline void sse1_div_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divss %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse1_div_float32_128(float32 *a, float32 *b, float32* result) { inline void sse1_div_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divps %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -484,33 +446,23 @@ inline void sse1_div_float32_128(float32 *a, float32 *b, float32* result) {
// SSE 2 float32 // SSE 2 float32
inline void sse2_add_float32_64(float32 *a, float32 *b, float32* result) { inline void sse2_add_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addss %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse2_add_float32_128(float32 *a, float32 *b, float32* result) { inline void sse2_add_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addps %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -518,33 +470,23 @@ inline void sse2_add_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse2_sub_float32_64(float32 *a, float32 *b, float32* result) { inline void sse2_sub_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subss %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse2_sub_float32_128(float32 *a, float32 *b, float32* result) { inline void sse2_sub_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subps %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -552,33 +494,23 @@ inline void sse2_sub_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse2_mul_float32_64(float32 *a, float32 *b, float32* result) { inline void sse2_mul_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulss %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse2_mul_float32_128(float32 *a, float32 *b, float32* result) { inline void sse2_mul_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -586,33 +518,23 @@ inline void sse2_mul_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse2_div_float32_64(float32 *a, float32 *b, float32* result) { inline void sse2_div_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divss %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse2_div_float32_128(float32 *a, float32 *b, float32* result) { inline void sse2_div_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divps %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -658,33 +580,23 @@ inline void sse2_mul_int32_128(s32 *a, s32 *b, s32* result) {
// SSE 3 float32 // SSE 3 float32
inline void sse3_add_float32_64(float32 *a, float32 *b, float32* result) { inline void sse3_add_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addss %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse3_add_float32_128(float32 *a, float32 *b, float32* result) { inline void sse3_add_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addps %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -692,33 +604,23 @@ inline void sse3_add_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse3_sub_float32_64(float32 *a, float32 *b, float32* result) { inline void sse3_sub_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subss %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse3_sub_float32_128(float32 *a, float32 *b, float32* result) { inline void sse3_sub_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subps %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -726,33 +628,23 @@ inline void sse3_sub_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse3_mul_float32_64(float32 *a, float32 *b, float32* result) { inline void sse3_mul_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulss %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse3_mul_float32_128(float32 *a, float32 *b, float32* result) { inline void sse3_mul_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -760,33 +652,23 @@ inline void sse3_mul_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse3_div_float32_64(float32 *a, float32 *b, float32* result) { inline void sse3_div_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divss %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse3_div_float32_128(float32 *a, float32 *b, float32* result) { inline void sse3_div_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divps %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -795,33 +677,23 @@ inline void sse3_div_float32_128(float32 *a, float32 *b, float32* result) {
// SSSE 3 float32 // SSSE 3 float32
inline void ssse3_add_float32_64(float32 *a, float32 *b, float32* result) { inline void ssse3_add_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addss %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void ssse3_add_float32_128(float32 *a, float32 *b, float32* result) { inline void ssse3_add_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addps %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -829,33 +701,23 @@ inline void ssse3_add_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void ssse3_sub_float32_64(float32 *a, float32 *b, float32* result) { inline void ssse3_sub_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subss %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void ssse3_sub_float32_128(float32 *a, float32 *b, float32* result) { inline void ssse3_sub_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subps %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -863,33 +725,23 @@ inline void ssse3_sub_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void ssse3_mul_float32_64(float32 *a, float32 *b, float32* result) { inline void ssse3_mul_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulss %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void ssse3_mul_float32_128(float32 *a, float32 *b, float32* result) { inline void ssse3_mul_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -897,33 +749,23 @@ inline void ssse3_mul_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void ssse3_div_float32_64(float32 *a, float32 *b, float32* result) { inline void ssse3_div_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divss %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void ssse3_div_float32_128(float32 *a, float32 *b, float32* result) { inline void ssse3_div_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divps %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -932,33 +774,23 @@ inline void ssse3_div_float32_128(float32 *a, float32 *b, float32* result) {
// SSE4.1 float32 // SSE4.1 float32
inline void sse41_add_float32_64(float32 *a, float32 *b, float32* result) { inline void sse41_add_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addss %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse41_add_float32_128(float32 *a, float32 *b, float32* result) { inline void sse41_add_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addps %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -966,33 +798,23 @@ inline void sse41_add_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse41_sub_float32_64(float32 *a, float32 *b, float32* result) { inline void sse41_sub_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subss %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse41_sub_float32_128(float32 *a, float32 *b, float32* result) { inline void sse41_sub_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subps %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -1000,33 +822,23 @@ inline void sse41_sub_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse41_mul_float32_64(float32 *a, float32 *b, float32* result) { inline void sse41_mul_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulss %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse41_mul_float32_128(float32 *a, float32 *b, float32* result) { inline void sse41_mul_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -1034,33 +846,23 @@ inline void sse41_mul_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse41_div_float32_64(float32 *a, float32 *b, float32* result) { inline void sse41_div_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divss %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse41_div_float32_128(float32 *a, float32 *b, float32* result) { inline void sse41_div_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divps %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -1069,33 +871,23 @@ inline void sse41_div_float32_128(float32 *a, float32 *b, float32* result) {
// SSE4.2 float32 // SSE4.2 float32
inline void sse42_add_float32_64(float32 *a, float32 *b, float32* result) { inline void sse42_add_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t" // Load 128 bits (4 floats) from a into xmm0; wider than the 2 floats the _64 name implies
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t" // Load 128 bits (4 floats) from b into xmm1
"addss %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t" // Add xmm1 to xmm0 (packed single precision)
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t" // Store all 128 bits (4 floats) of xmm0 to result
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse42_add_float32_128(float32 *a, float32 *b, float32* result) { inline void sse42_add_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"addps %%xmm1, %%xmm0\n\t" "addps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -1103,33 +895,23 @@ inline void sse42_add_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse42_sub_float32_64(float32 *a, float32 *b, float32* result) { inline void sse42_sub_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subss %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse42_sub_float32_128(float32 *a, float32 *b, float32* result) { inline void sse42_sub_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"subps %%xmm1, %%xmm0\n\t" "subps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -1138,33 +920,23 @@ inline void sse42_sub_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse42_mul_float32_64(float32 *a, float32 *b, float32* result) { inline void sse42_mul_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulss %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse42_mul_float32_128(float32 *a, float32 *b, float32* result) { inline void sse42_mul_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t" "mulps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
@ -1172,33 +944,23 @@ inline void sse42_mul_float32_128(float32 *a, float32 *b, float32* result) {
} }
inline void sse42_div_float32_64(float32 *a, float32 *b, float32* result) { inline void sse42_div_float32_64(float32 *a, float32 *b, float32* result) {
float a128[2];
a128[0] = *a;
a128[1] = 0;
float b128[2];
b128[0] = *b;
b128[1] = 0;
float r128[2];
r128[0] = *result;
r128[1] = 0;
__asm__ ( __asm__ (
"movss (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movss (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divss %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movss %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a128), "r" (b128), "r" (r128) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"
); );
*result = r128[0];
} }
inline void sse42_div_float32_128(float32 *a, float32 *b, float32* result) { inline void sse42_div_float32_128(float32 *a, float32 *b, float32* result) {
__asm__ ( __asm__ (
"movaps (%0), %%xmm0\n\t" "movups (%0), %%xmm0\n\t"
"movaps (%1), %%xmm1\n\t" "movups (%1), %%xmm1\n\t"
"divps %%xmm1, %%xmm0\n\t" "divps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, (%2)\n\t" "movups %%xmm0, (%2)\n\t"
: :
: "r" (a), "r" (b), "r" (result) : "r" (a), "r" (b), "r" (result)
: "xmm0", "xmm1" : "xmm0", "xmm1"

View file

@ -64,27 +64,31 @@ inline Vector2 v2_divf(Vector2 a, float32 s) {
} }
inline Vector3 v3_add(Vector3 a, Vector3 b) { inline Vector3 v3_add(Vector3 a, Vector3 b) {
simd_add_float32_64((f32*)&a, (f32*)&b, (f32*)&a); Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
a.z += b.z; Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
return a; simd_add_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128);
return a128.xyz;
} }
inline Vector3 v3_sub(Vector3 a, Vector3 b) { inline Vector3 v3_sub(Vector3 a, Vector3 b) {
simd_sub_float32_64((f32*)&a, (f32*)&b, (f32*)&a); Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
a.z -= b.z; Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
return a; simd_sub_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128);
return a128.xyz;
} }
inline Vector3 v3_mul(Vector3 a, Vector3 b) { inline Vector3 v3_mul(Vector3 a, Vector3 b) {
simd_sub_float32_64((f32*)&a, (f32*)&b, (f32*)&a); Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
a.z *= b.z; Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
return a; simd_mul_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128);
return a128.xyz;
} }
inline Vector3 v3_mulf(Vector3 a, float32 s) { inline Vector3 v3_mulf(Vector3 a, float32 s) {
return v3_mul(a, v3(s, s, s)); return v3_mul(a, v3(s, s, s));
} }
inline Vector3 v3_div(Vector3 a, Vector3 b) { inline Vector3 v3_div(Vector3 a, Vector3 b) {
simd_sub_float32_64((f32*)&a, (f32*)&b, (f32*)&a); Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
a.z /= b.z; Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
return a; simd_div_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128);
return a128.xyz;
} }
inline Vector3 v3_divf(Vector3 a, float32 s) { inline Vector3 v3_divf(Vector3 a, float32 s) {
return v3_div(a, v3(s, s, s)); return v3_div(a, v3(s, s, s));

View file

@ -1,4 +1,9 @@
///
/// Most of these tests were generated by GPT, so there might be some goofiness.
///
void log_heap() { void log_heap() {
os_spinlock_lock(heap_lock); os_spinlock_lock(heap_lock);
print("\nHEAP:\n"); print("\nHEAP:\n");
@ -730,6 +735,186 @@ void test_simd() {
print(" simd took %llu cycles ", cycles); print(" simd took %llu cycles ", cycles);
} }
// Exercises the math library: vector construction, component-wise vector
// arithmetic for Vector2/Vector3/Vector4 (including the scalar variants),
// Vector2 normalization/rotation, and several Matrix4 helpers.
// Takes no parameters and returns nothing; every check goes through assert().
// Most checks compare with ==; normalization, rotation and the 7/3 lane of
// the Vector4 division compare against an absolute tolerance instead.
void test_math_library() {
	// Absolute tolerance for results that are not compared with ==.
	const double tolerance = 1e-6;

	// Test vector creation and access
	Vector2 v2_test = v2(1.0f, 2.0f);
	assert(v2_test.x == 1.0f && v2_test.y == 2.0f, "v2 creation incorrect");
	Vector3 v3_test = v3(1.0f, 2.0f, 3.0f);
	assert(v3_test.x == 1.0f && v3_test.y == 2.0f && v3_test.z == 3.0f, "v3 creation incorrect");
	Vector4 v4_test = v4(1.0f, 2.0f, 3.0f, 4.0f);
	assert(v4_test.x == 1.0f && v4_test.y == 2.0f && v4_test.z == 3.0f && v4_test.w == 4.0f, "v4 creation incorrect");

	// Test vector operations
	Vector2 v2_a = v2(3.0f, 4.0f);
	Vector2 v2_b = v2(1.0f, 2.0f);
	Vector2 v2_result = v2_add(v2_a, v2_b);
	assert(v2_result.x == 4.0f && v2_result.y == 6.0f, "v2_add incorrect. %.3f, %.3f", v2_result.x, v2_result.y);
	v2_result = v2_sub(v2_a, v2_b);
	assert(v2_result.x == 2.0f && v2_result.y == 2.0f, "v2_sub incorrect");
	v2_result = v2_mul(v2_a, v2_b);
	assert(v2_result.x == 3.0f && v2_result.y == 8.0f, "v2_mul incorrect");
	v2_result = v2_div(v2_a, v2_b);
	assert(v2_result.x == 3.0f && v2_result.y == 2.0f, "v2_div incorrect");
	v2_result = v2_mulf(v2_a, 2.0f);
	assert(v2_result.x == 6.0f && v2_result.y == 8.0f, "v2_mulf incorrect");
	v2_result = v2_divf(v2_a, 2.0f);
	assert(v2_result.x == 1.5f && v2_result.y == 2.0f, "v2_divf incorrect");

	// Test matrix operations
	Matrix4 m1 = m4_scalar(2.0f);
	assert(m1.data[0] == 2.0f && m1.data[5] == 2.0f && m1.data[10] == 2.0f && m1.data[15] == 2.0f, "m4_scalar incorrect");
	Vector3 translation = v3(1.0f, 2.0f, 3.0f);
	Matrix4 m2 = m4_make_translation(translation);
	assert(m2.m[0][3] == 1.0f && m2.m[1][3] == 2.0f && m2.m[2][3] == 3.0f, "m4_make_translation incorrect");
	Vector3 scale = v3(2.0f, 3.0f, 4.0f);
	Matrix4 m4 = m4_make_scale(scale);
	assert(m4.m[0][0] == 2.0f && m4.m[1][1] == 3.0f && m4.m[2][2] == 4.0f, "m4_make_scale incorrect");

	// Test orthographic projection matrix
	Matrix4 ortho = m4_make_orthographic_projection(-1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f);
	assert(ortho.m[0][0] == 1.0f && ortho.m[1][1] == 1.0f && ortho.m[2][2] == 1.0f, "m4_make_orthographic_projection incorrect");

	// Test matrix multiplication: m2 and m5 both carry (1, 2, 3) in the last
	// column, and the product is expected to carry (2, 4, 6) there.
	Matrix4 m5 = m4_scalar(1.0f);
	m5.m[0][3] = 1.0f; m5.m[1][3] = 2.0f; m5.m[2][3] = 3.0f;
	Matrix4 result_matrix = m4_multiply(m2, m5);
	assert(result_matrix.m[0][3] == 2.0f && result_matrix.m[1][3] == 4.0f && result_matrix.m[2][3] == 6.0f, "m4_multiply incorrect");

	// Test matrix inverse: inverting m4_scalar(1.0f) must give back the same elements
	Matrix4 identity = m4_scalar(1.0f);
	Matrix4 inverse_matrix = m4_inverse(identity);
	for (int i = 0; i < 4; ++i) {
		for (int j = 0; j < 4; ++j) {
			assert(inverse_matrix.m[i][j] == identity.m[i][j], "m4_inverse incorrect for identity matrix");
		}
	}

	// Test Vector2 creation
	Vector2 v2_test1 = v2(1.0f, 2.0f);
	assert(v2_test1.x == 1.0f && v2_test1.y == 2.0f, "Vector2 creation failed");
	Vector2 v2_test2 = v2(3.0f, 4.0f);
	assert(v2_test2.x == 3.0f && v2_test2.y == 4.0f, "Vector2 creation failed");

	// Test Vector2 addition
	Vector2 v2_add_result = v2_add(v2_test1, v2_test2);
	assert(v2_add_result.x == 4.0f && v2_add_result.y == 6.0f, "Vector2 addition failed");

	// Test Vector2 subtraction
	Vector2 v2_sub_result = v2_sub(v2_test2, v2_test1);
	assert(v2_sub_result.x == 2.0f && v2_sub_result.y == 2.0f, "Vector2 subtraction failed");

	// Test Vector2 multiplication
	Vector2 v2_mul_result = v2_mul(v2_test1, v2_test2);
	assert(v2_mul_result.x == 3.0f && v2_mul_result.y == 8.0f, "Vector2 multiplication failed");

	// Test Vector2 scalar multiplication
	Vector2 v2_mulf_result = v2_mulf(v2_test1, 2.0f);
	assert(v2_mulf_result.x == 2.0f && v2_mulf_result.y == 4.0f, "Vector2 scalar multiplication failed");

	// Test Vector2 division
	Vector2 v2_div_result = v2_div(v2_test2, v2_test1);
	assert(v2_div_result.x == 3.0f && v2_div_result.y == 2.0f, "Vector2 division failed");

	// Test Vector2 scalar division
	Vector2 v2_divf_result = v2_divf(v2_test2, 2.0f);
	assert(v2_divf_result.x == 1.5f && v2_divf_result.y == 2.0f, "Vector2 scalar division failed");

	// Test Vector2 normalization: (3, 4) has length 5, so expect (0.6, 0.8)
	Vector2 v2_norm_result = v2_normalize(v2(3.0f, 4.0f));
	assert(fabs(v2_norm_result.x - 0.6f) < tolerance && fabs(v2_norm_result.y - 0.8f) < tolerance, "Vector2 normalization failed");

	// Test Vector2 rotation: (1, 0) rotated by PI/2 around the origin, expect (0, 1)
	Vector2 pivot = v2(0.0f, 0.0f);
	Vector2 point = v2(1.0f, 0.0f);
	Vector2 v2_rot_result = v2_rotate_point_around_pivot(point, pivot, PI32 / 2.0f);
	assert(fabs(v2_rot_result.x) < tolerance && fabs(v2_rot_result.y - 1.0f) < tolerance, "Vector2 rotation around pivot failed");

	// Test Vector3 creation
	Vector3 v3_test1 = v3(1.0f, 2.0f, 3.0f);
	assert(v3_test1.x == 1.0f && v3_test1.y == 2.0f && v3_test1.z == 3.0f, "Vector3 creation failed");
	Vector3 v3_test2 = v3(4.0f, 5.0f, 6.0f);
	assert(v3_test2.x == 4.0f && v3_test2.y == 5.0f && v3_test2.z == 6.0f, "Vector3 creation failed");

	// Test Vector3 addition
	Vector3 v3_add_result = v3_add(v3_test1, v3_test2);
	assert(v3_add_result.x == 5.0f && v3_add_result.y == 7.0f && v3_add_result.z == 9.0f, "Vector3 addition failed");

	// Test Vector3 subtraction
	Vector3 v3_sub_result = v3_sub(v3_test2, v3_test1);
	assert(v3_sub_result.x == 3.0f && v3_sub_result.y == 3.0f && v3_sub_result.z == 3.0f, "Vector3 subtraction failed");

	// Test Vector3 multiplication
	Vector3 v3_mul_result = v3_mul(v3_test1, v3_test2);
	assert(v3_mul_result.x == 4.0f && v3_mul_result.y == 10.0f && v3_mul_result.z == 18.0f, "Vector3 multiplication failed");

	// Test Vector3 scalar multiplication
	Vector3 v3_mulf_result = v3_mulf(v3_test1, 2.0f);
	assert(v3_mulf_result.x == 2.0f && v3_mulf_result.y == 4.0f && v3_mulf_result.z == 6.0f, "Vector3 scalar multiplication failed");

	// Test Vector3 division
	Vector3 v3_div_result = v3_div(v3_test2, v3_test1);
	assert(v3_div_result.x == 4.0f && v3_div_result.y == 2.5f && v3_div_result.z == 2.0f, "Vector3 division failed");

	// Test Vector3 scalar division
	Vector3 v3_divf_result = v3_divf(v3_test2, 2.0f);
	assert(v3_divf_result.x == 2.0f && v3_divf_result.y == 2.5f && v3_divf_result.z == 3.0f, "Vector3 scalar division failed");

	// Test Vector4 creation
	Vector4 v4_test1 = v4(1.0f, 2.0f, 3.0f, 4.0f);
	assert(v4_test1.x == 1.0f && v4_test1.y == 2.0f && v4_test1.z == 3.0f && v4_test1.w == 4.0f, "Vector4 creation failed");
	Vector4 v4_test2 = v4(5.0f, 6.0f, 7.0f, 8.0f);
	assert(v4_test2.x == 5.0f && v4_test2.y == 6.0f && v4_test2.z == 7.0f && v4_test2.w == 8.0f, "Vector4 creation failed");

	// Test Vector4 addition
	Vector4 v4_add_result = v4_add(v4_test1, v4_test2);
	assert(v4_add_result.x == 6.0f && v4_add_result.y == 8.0f && v4_add_result.z == 10.0f && v4_add_result.w == 12.0f, "Vector4 addition failed");

	// Test Vector4 subtraction
	Vector4 v4_sub_result = v4_sub(v4_test2, v4_test1);
	assert(v4_sub_result.x == 4.0f && v4_sub_result.y == 4.0f && v4_sub_result.z == 4.0f && v4_sub_result.w == 4.0f, "Vector4 subtraction failed");

	// Test Vector4 multiplication
	Vector4 v4_mul_result = v4_mul(v4_test1, v4_test2);
	assert(v4_mul_result.x == 5.0f && v4_mul_result.y == 12.0f && v4_mul_result.z == 21.0f && v4_mul_result.w == 32.0f, "Vector4 multiplication failed");

	// Test Vector4 scalar multiplication
	Vector4 v4_mulf_result = v4_mulf(v4_test1, 2.0f);
	assert(v4_mulf_result.x == 2.0f && v4_mulf_result.y == 4.0f && v4_mulf_result.z == 6.0f && v4_mulf_result.w == 8.0f, "Vector4 scalar multiplication failed");

	// Test Vector4 division
	Vector4 v4_div_result = v4_div(v4_test2, v4_test1);
	assert(v4_div_result.x == 5.0f && v4_div_result.y == 3.0f && v4_div_result.w == 2.0f, "Vector4 division failed");
	// The z lane is 7/3, which has no exact float32 representation, so it is
	// checked against a tolerance rather than left unverified.
	assert(fabs(v4_div_result.z - 7.0f / 3.0f) < tolerance, "Vector4 division failed");

	// Test Vector4 scalar division
	Vector4 v4_divf_result = v4_divf(v4_test2, 2.0f);
	assert(v4_divf_result.x == 2.5f && v4_divf_result.y == 3.0f && v4_divf_result.z == 3.5f && v4_divf_result.w == 4.0f, "Vector4 scalar division failed");

	// Test mixed vector and scalar operations
	Vector2 mixed_v2 = v2(2.0f, 4.0f);
	Vector2 mixed_v2_result = v2_mulf(mixed_v2, 0.5f);
	assert(mixed_v2_result.x == 1.0f && mixed_v2_result.y == 2.0f, "Mixed Vector2 scalar multiplication failed");
	Vector3 mixed_v3 = v3(3.0f, 6.0f, 9.0f);
	Vector3 mixed_v3_result = v3_divf(mixed_v3, 3.0f);
	assert(mixed_v3_result.x == 1.0f && mixed_v3_result.y == 2.0f && mixed_v3_result.z == 3.0f, "Mixed Vector3 scalar division failed");
	Vector4 mixed_v4 = v4(4.0f, 8.0f, 12.0f, 16.0f);
	Vector4 mixed_v4_result = v4_mulf(mixed_v4, 0.25f);
	assert(mixed_v4_result.x == 1.0f && mixed_v4_result.y == 2.0f && mixed_v4_result.z == 3.0f && mixed_v4_result.w == 4.0f, "Mixed Vector4 scalar multiplication failed");
}
void oogabooga_run_tests() { void oogabooga_run_tests() {
print("Testing simd... "); print("Testing simd... ");
test_simd(); test_simd();
@ -763,4 +948,8 @@ void oogabooga_run_tests() {
test_file_io(); test_file_io();
print("OK!\n"); print("OK!\n");
// Math library suite: the progress label names the suite that actually runs.
print("Testing math library... ");
test_math_library();
print("OK!\n");
} }