From aceadf4aca94159f18ed98968ad6f897d8c0f625 Mon Sep 17 00:00:00 2001 From: Charlie <66182434+asbott@users.noreply.github.com> Date: Wed, 3 Jul 2024 00:41:52 +0200 Subject: [PATCH] Fix simd & Vector oopsies + lots of tests --- build.c | 4 +- oogabooga/cpu.c | 626 ++++++++++++++------------------------------ oogabooga/linmath.c | 28 +- oogabooga/tests.c | 189 +++++++++++++ 4 files changed, 401 insertions(+), 446 deletions(-) diff --git a/build.c b/build.c index b7a0bf9..2ee8547 100644 --- a/build.c +++ b/build.c @@ -32,6 +32,6 @@ typedef struct Context_Extra { // // Comment & Uncomment to swap projects -// #include "oogabooga/examples/renderer_stress_test.c" +#include "oogabooga/examples/renderer_stress_test.c" // #include "oogabooga/examples/minimal_game_loop.c" -#include "entry_randygame.c" \ No newline at end of file +// #include "entry_randygame.c" \ No newline at end of file diff --git a/oogabooga/cpu.c b/oogabooga/cpu.c index 7e70070..91852ae 100644 --- a/oogabooga/cpu.c +++ b/oogabooga/cpu.c @@ -88,6 +88,8 @@ Cpu_Capabilities query_cpu_capabilities() { return result; } +void* memcpy(void* dest, const void* source, size_t size); + void (*simd_add_float32_64 )(float32 *a, float32 *b, float32* result) = 0; void (*simd_add_float32_128)(float32 *a, float32 *b, float32* result) = 0; void (*simd_add_float32_256)(float32 *a, float32 *b, float32* result) = 0; @@ -347,33 +349,23 @@ inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result) { // SSE 1 float32 inline void sse1_add_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "addss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a), "r" (b), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse1_add_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "addps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -381,33 +373,23 @@ inline void sse1_add_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse1_sub_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "subss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a), "r" (b), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse1_sub_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "subps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -415,33 +397,23 @@ inline void sse1_sub_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse1_mul_float32_64(float32 *a, float32 *b, float32* 
result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "mulss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a), "r" (b), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse1_mul_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "mulps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -449,33 +421,23 @@ inline void sse1_mul_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse1_div_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "divss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a), "r" (b), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse1_div_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "divps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -484,33 +446,23 @@ inline void sse1_div_float32_128(float32 *a, float32 *b, float32* result) { // SSE 2 float32 inline void sse2_add_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "addss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a), "r" (b), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse2_add_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "addps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -518,33 +470,23 @@ inline void sse2_add_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse2_sub_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "subss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a), "r" (b), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void 
sse2_sub_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "subps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -552,33 +494,23 @@ inline void sse2_sub_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse2_mul_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "mulss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a), "r" (b), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse2_mul_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "mulps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -586,33 +518,23 @@ inline void sse2_mul_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse2_div_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "divss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse2_div_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "divps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -658,33 +580,23 @@ inline void sse2_mul_int32_128(s32 *a, s32 *b, s32* result) { // SSE 3 float32 inline void sse3_add_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "addss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse3_add_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "addps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -692,33 +604,23 @@ inline void sse3_add_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse3_sub_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - 
a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "subss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse3_sub_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "subps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -726,33 +628,23 @@ inline void sse3_sub_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse3_mul_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "mulss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse3_mul_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "mulps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -760,33 +652,23 @@ inline void sse3_mul_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse3_div_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "divss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse3_div_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "divps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -795,33 +677,23 @@ inline void sse3_div_float32_128(float32 *a, float32 *b, float32* result) { // SSSE 3 float32 inline void ssse3_add_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "addss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void ssse3_add_float32_128(float32 
*a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "addps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -829,33 +701,23 @@ inline void ssse3_add_float32_128(float32 *a, float32 *b, float32* result) { } inline void ssse3_sub_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "subss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void ssse3_sub_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "subps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -863,33 +725,23 @@ inline void ssse3_sub_float32_128(float32 *a, float32 *b, float32* result) { } inline void ssse3_mul_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "mulss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void ssse3_mul_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "mulps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -897,33 +749,23 @@ inline void ssse3_mul_float32_128(float32 *a, float32 *b, float32* result) { } inline void ssse3_div_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "divss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void ssse3_div_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "divps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -932,33 +774,23 @@ inline void ssse3_div_float32_128(float32 *a, float32 *b, float32* result) { // SSE4.1 float32 inline void sse41_add_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; 
- a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "addss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse41_add_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "addps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -966,33 +798,23 @@ inline void sse41_add_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse41_sub_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "subss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse41_sub_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "subps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -1000,33 +822,23 @@ inline void sse41_sub_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse41_mul_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "mulss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse41_mul_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "mulps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -1034,33 +846,23 @@ inline void sse41_mul_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse41_div_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "divss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse41_div_float32_128(float32 *a, 
float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "divps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -1069,33 +871,23 @@ inline void sse41_div_float32_128(float32 *a, float32 *b, float32* result) { // SSE4.2 float32 inline void sse42_add_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "addss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" // Load 2 floats from a into xmm0 + "movups (%1), %%xmm1\n\t" // Load 2 floats from b into xmm1 + "addps %%xmm1, %%xmm0\n\t" // Add xmm1 to xmm0 (packed single precision) + "movups %%xmm0, (%2)\n\t" // Store result from xmm0 to result : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse42_add_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "addps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -1103,33 +895,23 @@ inline void sse42_add_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse42_sub_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "subss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse42_sub_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "subps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -1138,33 +920,23 @@ inline void sse42_sub_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse42_mul_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "mulss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse42_mul_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "mulps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" @@ -1172,33 +944,23 @@ inline void 
sse42_mul_float32_128(float32 *a, float32 *b, float32* result) { } inline void sse42_div_float32_64(float32 *a, float32 *b, float32* result) { - float a128[2]; - a128[0] = *a; - a128[1] = 0; - float b128[2]; - b128[0] = *b; - b128[1] = 0; - float r128[2]; - r128[0] = *result; - r128[1] = 0; __asm__ ( - "movss (%0), %%xmm0\n\t" - "movss (%1), %%xmm1\n\t" - "divss %%xmm1, %%xmm0\n\t" - "movss %%xmm0, (%2)\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movups %%xmm0, (%2)\n\t" : - : "r" (a128), "r" (b128), "r" (r128) + : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" ); - *result = r128[0]; } inline void sse42_div_float32_128(float32 *a, float32 *b, float32* result) { __asm__ ( - "movaps (%0), %%xmm0\n\t" - "movaps (%1), %%xmm1\n\t" + "movups (%0), %%xmm0\n\t" + "movups (%1), %%xmm1\n\t" "divps %%xmm1, %%xmm0\n\t" - "movaps %%xmm0, (%2)\n\t" + "movups %%xmm0, (%2)\n\t" : : "r" (a), "r" (b), "r" (result) : "xmm0", "xmm1" diff --git a/oogabooga/linmath.c b/oogabooga/linmath.c index 51eb229..3ad91c8 100644 --- a/oogabooga/linmath.c +++ b/oogabooga/linmath.c @@ -64,27 +64,31 @@ inline Vector2 v2_divf(Vector2 a, float32 s) { } inline Vector3 v3_add(Vector3 a, Vector3 b) { - simd_add_float32_64((f32*)&a, (f32*)&b, (f32*)&a); - a.z += b.z; - return a; + Vector4 a128 = v4(a.x, a.y, a.z, 0.0); + Vector4 b128 = v4(b.x, b.y, b.z, 0.0); + simd_add_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); + return a128.xyz; } inline Vector3 v3_sub(Vector3 a, Vector3 b) { - simd_sub_float32_64((f32*)&a, (f32*)&b, (f32*)&a); - a.z -= b.z; - return a; + Vector4 a128 = v4(a.x, a.y, a.z, 0.0); + Vector4 b128 = v4(b.x, b.y, b.z, 0.0); + simd_sub_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); + return a128.xyz; } inline Vector3 v3_mul(Vector3 a, Vector3 b) { - simd_sub_float32_64((f32*)&a, (f32*)&b, (f32*)&a); - a.z *= b.z; - return a; + Vector4 a128 = v4(a.x, a.y, a.z, 0.0); + Vector4 b128 = v4(b.x, b.y, b.z, 0.0); + simd_mul_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); + return a128.xyz; } inline Vector3 v3_mulf(Vector3 a, float32 s) { return v3_mul(a, v3(s, s, s)); } inline Vector3 v3_div(Vector3 a, Vector3 b) { - simd_sub_float32_64((f32*)&a, (f32*)&b, (f32*)&a); - a.z /= b.z; - return a; + Vector4 a128 = v4(a.x, a.y, a.z, 0.0); + Vector4 b128 = v4(b.x, b.y, b.z, 0.0); + simd_div_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); + return a128.xyz; } inline Vector3 v3_divf(Vector3 a, float32 s) { return v3_div(a, v3(s, s, s)); diff --git a/oogabooga/tests.c b/oogabooga/tests.c index 38964a5..5bb42f3 100644 --- a/oogabooga/tests.c +++ b/oogabooga/tests.c @@ -1,4 +1,9 @@ + +/// +/// Most of these are generated by gpt so there might be some goofyness +/// + void log_heap() { os_spinlock_lock(heap_lock); print("\nHEAP:\n"); @@ -730,6 +735,186 @@ void test_simd() { print(" simd took %llu cycles ", cycles); } +void test_math_library() { + + // Test vector creation and access + Vector2 v2_test = v2(1.0f, 2.0f); + assert(v2_test.x == 1.0f && v2_test.y == 2.0f, "v2 creation incorrect"); + + Vector3 v3_test = v3(1.0f, 2.0f, 3.0f); + assert(v3_test.x == 1.0f && v3_test.y == 2.0f && v3_test.z == 3.0f, "v3 creation incorrect"); + + Vector4 v4_test = v4(1.0f, 2.0f, 3.0f, 4.0f); + assert(v4_test.x == 1.0f && v4_test.y == 2.0f && v4_test.z == 3.0f && v4_test.w == 4.0f, "v4 creation incorrect"); + + // Test vector operations + Vector2 v2_a = v2(3.0f, 4.0f); + Vector2 v2_b = v2(1.0f, 2.0f); + Vector2 v2_result = v2_add(v2_a, v2_b); + assert(v2_result.x == 4.0f && 
v2_result.y == 6.0f, "v2_add incorrect. %.3f, %.3f", v2_result.x, v2_result.y);
+
+    v2_result = v2_sub(v2_a, v2_b);
+    assert(v2_result.x == 2.0f && v2_result.y == 2.0f, "v2_sub incorrect");
+
+    v2_result = v2_mul(v2_a, v2_b);
+    assert(v2_result.x == 3.0f && v2_result.y == 8.0f, "v2_mul incorrect");
+
+    v2_result = v2_div(v2_a, v2_b);
+    assert(v2_result.x == 3.0f && v2_result.y == 2.0f, "v2_div incorrect");
+
+    v2_result = v2_mulf(v2_a, 2.0f);
+    assert(v2_result.x == 6.0f && v2_result.y == 8.0f, "v2_mulf incorrect");
+
+    v2_result = v2_divf(v2_a, 2.0f);
+    assert(v2_result.x == 1.5f && v2_result.y == 2.0f, "v2_divf incorrect");
+
+    // Test matrix operations
+    Matrix4 m1 = m4_scalar(2.0f);
+    assert(m1.data[0] == 2.0f && m1.data[5] == 2.0f && m1.data[10] == 2.0f && m1.data[15] == 2.0f, "m4_scalar incorrect");
+
+    Vector3 translation = v3(1.0f, 2.0f, 3.0f);
+    Matrix4 m2 = m4_make_translation(translation);
+    assert(m2.m[0][3] == 1.0f && m2.m[1][3] == 2.0f && m2.m[2][3] == 3.0f, "m4_make_translation incorrect");
+
+    Vector3 scale = v3(2.0f, 3.0f, 4.0f);
+    Matrix4 m4 = m4_make_scale(scale);
+    assert(m4.m[0][0] == 2.0f && m4.m[1][1] == 3.0f && m4.m[2][2] == 4.0f, "m4_make_scale incorrect");
+
+    // Test orthographic projection matrix
+    Matrix4 ortho = m4_make_orthographic_projection(-1.0f, 1.0f, -1.0f, 1.0f, 1.0f, -1.0f);
+    assert(ortho.m[0][0] == 1.0f && ortho.m[1][1] == 1.0f && ortho.m[2][2] == 1.0f, "m4_make_orthographic_projection incorrect");
+
+    // Test matrix multiplication
+    Matrix4 m5 = m4_scalar(1.0f);
+    m5.m[0][3] = 1.0f; m5.m[1][3] = 2.0f; m5.m[2][3] = 3.0f;
+    Matrix4 result_matrix = m4_multiply(m2, m5);
+    assert(result_matrix.m[0][3] == 2.0f && result_matrix.m[1][3] == 4.0f && result_matrix.m[2][3] == 6.0f, "m4_multiply incorrect");
+
+    // Test matrix inverse
+    Matrix4 identity = m4_scalar(1.0f);
+    Matrix4 inverse_matrix = m4_inverse(identity);
+    for (int i = 0; i < 4; ++i) {
+        for (int j = 0; j < 4; ++j) {
+            assert(inverse_matrix.m[i][j] == identity.m[i][j], "m4_inverse incorrect for identity matrix");
+        }
+    }
+
+    // Test Vector2 creation
+    Vector2 v2_test1 = v2(1.0f, 2.0f);
+    assert(v2_test1.x == 1.0f && v2_test1.y == 2.0f, "Vector2 creation failed");
+
+    Vector2 v2_test2 = v2(3.0f, 4.0f);
+    assert(v2_test2.x == 3.0f && v2_test2.y == 4.0f, "Vector2 creation failed");
+
+    // Test Vector2 addition
+    Vector2 v2_add_result = v2_add(v2_test1, v2_test2);
+    assert(v2_add_result.x == 4.0f && v2_add_result.y == 6.0f, "Vector2 addition failed");
+
+    // Test Vector2 subtraction
+    Vector2 v2_sub_result = v2_sub(v2_test2, v2_test1);
+    assert(v2_sub_result.x == 2.0f && v2_sub_result.y == 2.0f, "Vector2 subtraction failed");
+
+    // Test Vector2 multiplication
+    Vector2 v2_mul_result = v2_mul(v2_test1, v2_test2);
+    assert(v2_mul_result.x == 3.0f && v2_mul_result.y == 8.0f, "Vector2 multiplication failed");
+
+    // Test Vector2 scalar multiplication
+    Vector2 v2_mulf_result = v2_mulf(v2_test1, 2.0f);
+    assert(v2_mulf_result.x == 2.0f && v2_mulf_result.y == 4.0f, "Vector2 scalar multiplication failed");
+
+    // Test Vector2 division
+    Vector2 v2_div_result = v2_div(v2_test2, v2_test1);
+    assert(v2_div_result.x == 3.0f && v2_div_result.y == 2.0f, "Vector2 division failed");
+
+    // Test Vector2 scalar division
+    Vector2 v2_divf_result = v2_divf(v2_test2, 2.0f);
+    assert(v2_divf_result.x == 1.5f && v2_divf_result.y == 2.0f, "Vector2 scalar division failed");
+
+    // Test Vector2 normalization
+    Vector2 v2_norm_result = v2_normalize(v2(3.0f, 4.0f));
+    assert(fabs(v2_norm_result.x - 0.6f) < 1e-6 && fabs(v2_norm_result.y - 0.8f) < 1e-6, "Vector2 normalization failed");
+
+    // Test Vector2 rotation
+    Vector2 pivot = v2(0.0f, 0.0f);
+    Vector2 point = v2(1.0f, 0.0f);
+    Vector2 v2_rot_result = v2_rotate_point_around_pivot(point, pivot, PI32 / 2.0f);
+    assert(fabs(v2_rot_result.x) < 1e-6 && fabs(v2_rot_result.y - 1.0f) < 1e-6, "Vector2 rotation around pivot failed");
+
+    // Test Vector3 creation
+    Vector3 v3_test1 = v3(1.0f, 2.0f, 3.0f);
+    assert(v3_test1.x == 1.0f && v3_test1.y == 2.0f && v3_test1.z == 3.0f, "Vector3 creation failed");
+
+    Vector3 v3_test2 = v3(4.0f, 5.0f, 6.0f);
+    assert(v3_test2.x == 4.0f && v3_test2.y == 5.0f && v3_test2.z == 6.0f, "Vector3 creation failed");
+
+    // Test Vector3 addition
+    Vector3 v3_add_result = v3_add(v3_test1, v3_test2);
+    assert(v3_add_result.x == 5.0f && v3_add_result.y == 7.0f && v3_add_result.z == 9.0f, "Vector3 addition failed");
+
+    // Test Vector3 subtraction
+    Vector3 v3_sub_result = v3_sub(v3_test2, v3_test1);
+    assert(v3_sub_result.x == 3.0f && v3_sub_result.y == 3.0f && v3_sub_result.z == 3.0f, "Vector3 subtraction failed");
+
+    // Test Vector3 multiplication
+    Vector3 v3_mul_result = v3_mul(v3_test1, v3_test2);
+    assert(v3_mul_result.x == 4.0f && v3_mul_result.y == 10.0f && v3_mul_result.z == 18.0f, "Vector3 multiplication failed");
+
+    // Test Vector3 scalar multiplication
+    Vector3 v3_mulf_result = v3_mulf(v3_test1, 2.0f);
+    assert(v3_mulf_result.x == 2.0f && v3_mulf_result.y == 4.0f && v3_mulf_result.z == 6.0f, "Vector3 scalar multiplication failed");
+
+    // Test Vector3 division
+    Vector3 v3_div_result = v3_div(v3_test2, v3_test1);
+    assert(v3_div_result.x == 4.0f && v3_div_result.y == 2.5f && v3_div_result.z == 2.0f, "Vector3 division failed");
+
+    // Test Vector3 scalar division
+    Vector3 v3_divf_result = v3_divf(v3_test2, 2.0f);
+    assert(v3_divf_result.x == 2.0f && v3_divf_result.y == 2.5f && v3_divf_result.z == 3.0f, "Vector3 scalar division failed");
+
+    // Test Vector4 creation
+    Vector4 v4_test1 = v4(1.0f, 2.0f, 3.0f, 4.0f);
+    assert(v4_test1.x == 1.0f && v4_test1.y == 2.0f && v4_test1.z == 3.0f && v4_test1.w == 4.0f, "Vector4 creation failed");
+
+    Vector4 v4_test2 = v4(5.0f, 6.0f, 7.0f, 8.0f);
+    assert(v4_test2.x == 5.0f && v4_test2.y == 6.0f && v4_test2.z == 7.0f && v4_test2.w == 8.0f, "Vector4 creation failed");
+
+    // Test Vector4 addition
+    Vector4 v4_add_result = v4_add(v4_test1, v4_test2);
+    assert(v4_add_result.x == 6.0f && v4_add_result.y == 8.0f && v4_add_result.z == 10.0f && v4_add_result.w == 12.0f, "Vector4 addition failed");
+
+    // Test Vector4 subtraction
+    Vector4 v4_sub_result = v4_sub(v4_test2, v4_test1);
+    assert(v4_sub_result.x == 4.0f && v4_sub_result.y == 4.0f && v4_sub_result.z == 4.0f && v4_sub_result.w == 4.0f, "Vector4 subtraction failed");
+
+    // Test Vector4 multiplication
+    Vector4 v4_mul_result = v4_mul(v4_test1, v4_test2);
+    assert(v4_mul_result.x == 5.0f && v4_mul_result.y == 12.0f && v4_mul_result.z == 21.0f && v4_mul_result.w == 32.0f, "Vector4 multiplication failed");
+
+    // Test Vector4 scalar multiplication
+    Vector4 v4_mulf_result = v4_mulf(v4_test1, 2.0f);
+    assert(v4_mulf_result.x == 2.0f && v4_mulf_result.y == 4.0f && v4_mulf_result.z == 6.0f && v4_mulf_result.w == 8.0f, "Vector4 scalar multiplication failed");
+
+    // Test Vector4 division
+    Vector4 v4_div_result = v4_div(v4_test2, v4_test1);
+    assert(v4_div_result.x == 5.0f && v4_div_result.y == 3.0f && v4_div_result.w == 2.0f, "Vector4 division failed");
+
+    // Test Vector4 scalar division
+    Vector4 v4_divf_result = v4_divf(v4_test2, 2.0f);
+    assert(v4_divf_result.x == 2.5f && v4_divf_result.y == 3.0f && v4_divf_result.z == 3.5f && v4_divf_result.w == 4.0f, "Vector4 scalar division failed");
+
+    // Test mixed vector and scalar operations
+    Vector2 mixed_v2 = v2(2.0f, 4.0f);
+    Vector2 mixed_v2_result = v2_mulf(mixed_v2, 0.5f);
+    assert(mixed_v2_result.x == 1.0f && mixed_v2_result.y == 2.0f, "Mixed Vector2 scalar multiplication failed");
+
+    Vector3 mixed_v3 = v3(3.0f, 6.0f, 9.0f);
+    Vector3 mixed_v3_result = v3_divf(mixed_v3, 3.0f);
+    assert(mixed_v3_result.x == 1.0f && mixed_v3_result.y == 2.0f && mixed_v3_result.z == 3.0f, "Mixed Vector3 scalar division failed");
+
+    Vector4 mixed_v4 = v4(4.0f, 8.0f, 12.0f, 16.0f);
+    Vector4 mixed_v4_result = v4_mulf(mixed_v4, 0.25f);
+    assert(mixed_v4_result.x == 1.0f && mixed_v4_result.y == 2.0f && mixed_v4_result.z == 3.0f && mixed_v4_result.w == 4.0f, "Mixed Vector4 scalar multiplication failed");
+}
 
 void oogabooga_run_tests() {
     print("Testing simd... ");
     test_simd();
@@ -763,4 +948,8 @@ void oogabooga_run_tests() {
     test_file_io();
     print("OK!\n");
 
+    print("Testing math library... ");
+    test_math_library();
+    print("OK!\n");
+
 }
\ No newline at end of file
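
Note on the linmath.c change above: the old Vector3 operators called the 64-bit SIMD helper and then patched up .z by hand, and v3_mul/v3_div even dispatched to the sub helper. The fix widens each operand into a zero-padded 4-float temporary and performs a single 128-bit packed operation. Below is a minimal standalone sketch of that padding pattern; Vec3 and add_float32_128 are illustrative stand-ins, not the engine's actual Vector types or simd_* function pointers.

#include <stdio.h>

typedef struct { float x, y, z; } Vec3;  /* stand-in for the engine's Vector3 */

/* Stand-in for a 128-bit packed add (what simd_add_float32_128 resolves to). */
static void add_float32_128(const float *a, const float *b, float *result) {
    for (int i = 0; i < 4; ++i) result[i] = a[i] + b[i];
}

static Vec3 vec3_add_padded(Vec3 a, Vec3 b) {
    float a128[4] = { a.x, a.y, a.z, 0.0f };  /* pad to a full 128-bit lane */
    float b128[4] = { b.x, b.y, b.z, 0.0f };
    float r128[4];
    add_float32_128(a128, b128, r128);        /* one packed op, w lane discarded */
    return (Vec3){ r128[0], r128[1], r128[2] };
}

int main(void) {
    Vec3 r = vec3_add_padded((Vec3){ 1, 2, 3 }, (Vec3){ 4, 5, 6 });
    printf("%.1f %.1f %.1f\n", r.x, r.y, r.z);  /* expected: 5.0 7.0 9.0 */
    return 0;
}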
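
For context on the movaps -> movups change in cpu.c: movaps faults unless its memory operand is 16-byte aligned, and the float pointers handed to these wrappers (stack vectors, struct members) carry no such guarantee, while movups accepts any alignment. The sketch below expresses the same packed add with SSE intrinsics rather than the patch's inline asm (_mm_loadu_ps/_mm_storeu_ps are the unaligned movups forms); the function name is illustrative only.

#include <stdio.h>
#include <xmmintrin.h>  /* SSE: _mm_loadu_ps maps to movups, _mm_load_ps to movaps */

/* Packed 4-float add through unaligned loads/stores, so any float* is safe. */
static void add_float32_128_unaligned(const float *a, const float *b, float *result) {
    __m128 va = _mm_loadu_ps(a);                /* movups: no alignment requirement */
    __m128 vb = _mm_loadu_ps(b);
    _mm_storeu_ps(result, _mm_add_ps(va, vb));  /* movups store */
}

int main(void) {
    _Alignas(16) float raw[12] = { 0, 1, 2, 3, 4, 0, 10, 20, 30, 40, 0, 0 };
    float out[4];
    /* raw + 1 and raw + 6 are not 16-byte aligned; an aligned (movaps-style)
       load here could fault, while the unaligned path handles it. */
    add_float32_128_unaligned(raw + 1, raw + 6, out);
    printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);  /* 11.0 22.0 33.0 44.0 */
    return 0;
}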