I got very sidetracked.

- Add cpu.c to query CPU capabilities and intrinsics. - In init, find the best available SIMD instruction sets and assign the SIMD function pointers accordingly. - Replace the basic vector arithmetic procs with SIMD implementations.
2024-07-03 00:01:11 +02:00 · 2024-07-03 00:01:11 +02:00 · e52e1a403e
commit e52e1a403e
parent 1f2809d23e
5 changed files with 1914 additions and 129 deletions
--- a/oogabooga/base.c
+++ b/oogabooga/base.c
@ -60,86 +60,27 @@ void printf(const char* fmt, ...);

 #define ZERO(t) (t){0}

-///
-// Compiler specific stuff
-// We make inline actually inline.
-#ifdef _MSC_VER
-    // Microsoft Visual C++
-    #define inline __forceinline
+
+#ifdef __clang__
+    // Clang/LLVM
+    #define inline __attribute__((always_inline)) inline
    #define COMPILER_HAS_MEMCPY_INTRINSICS 1
-    #include <intrin.h>
-    #pragma intrinsic(__rdtsc)
-    inline u64 rdtsc() {
-        return __rdtsc();
-    }
+    #define COMPILER_CLANG 1
 #elif defined(__GNUC__) || defined(__GNUG__)
    // GNU GCC/G++
    #define inline __attribute__((always_inline)) inline
    #define COMPILER_HAS_MEMCPY_INTRINSICS 1
-    inline u64 rdtsc() {
-        unsigned int lo, hi;
-        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
-        return ((u64)hi << 32) | lo;
-    }
-#elif defined(__clang__)
-    // Clang/LLVM
-    #define inline __attribute__((always_inline)) inline
-    #define COMPILER_HAS_MEMCPY_INTRINSICS 1
-    inline u64 rdtsc() {
-        unsigned int lo, hi;
-        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
-        return ((u64)hi << 32) | lo;
-    }
-#elif defined(__INTEL_COMPILER) || defined(__ICC)
-    // Intel C++ Compiler
+    #define COMPILER_GCC 1
+#elif defined(_MSC_VER)
+    // Microsoft Visual C++
    #define inline __forceinline
    #define COMPILER_HAS_MEMCPY_INTRINSICS 1
-    inline u64 rdtsc() {
-        return __rdtsc();
-    }
-#elif defined(__BORLANDC__)
-    // Borland C++
-    #define inline __inline
-    inline u64 rdtsc() {
-        unsigned int lo, hi;
-        __asm {
-            rdtsc
-            mov lo, eax
-            mov hi, edx
-        }
-        return ((u64)hi << 32) | lo;
-    }
-#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
-    // Oracle Solaris Studio
-    #define inline inline __attribute__((always_inline))
-    inline u64 rdtsc() {
-        unsigned int lo, hi;
-        asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
-        return ((u64)hi << 32) | lo;
-    }
-#elif defined(__IBMC__) || defined(__IBMCPP__)
-    // IBM XL C/C++ Compiler
-    #define inline __attribute__((always_inline)) inline
-    #define COMPILER_HAS_MEMCPY_INTRINSICS 1
-    inline u64 rdtsc() {
-        unsigned int lo, hi;
-        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
-        return ((u64)hi << 32) | lo;
-    }
-#elif defined(__PGI)
-    // Portland Group Compiler
-    #define inline inline __attribute__((always_inline))
-    inline u64 rdtsc() {
-        unsigned int lo, hi;
-        asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
-        return ((u64)hi << 32) | lo;
-    }
+    #define COMPILER_MSVC 1
 #else
-    // Fallback for unknown compilers
    #define inline inline
+    #define COMPILER_HAS_MEMCPY_INTRINSICS 0
 #endif

-
 #define FIRST_ARG(arg1, ...) arg1
 #define SECOND_ARG(arg1, arg2, ...) arg2
 #define print(...) _Generic((FIRST_ARG(__VA_ARGS__)), \
--- a/oogabooga/cpu.c
+++ b/oogabooga/cpu.c
--- a/oogabooga/linmath.c
+++ b/oogabooga/linmath.c
@ -14,94 +14,128 @@
 #define to_degrees32 to_degrees

 typedef union Vector2 {
-	struct {float x, y;};
+	struct {float32 x, y;};
 } Vector2;
-inline Vector2 v2(float x, float y) { return (Vector2){x, y}; }
+inline Vector2 v2(float32 x, float32 y) { return (Vector2){x, y}; }
 #define v2_expand(v) (v).x, (v).y

 typedef union Vector3 {
-	struct {float x, y, z;};
-	struct {float r, g, b;};
+	struct {float32 x, y, z;};
+	struct {float32 r, g, b;};
 	struct {Vector2 xy;};
-	struct {float _x; Vector2 yz;};
+	struct {float32 _x; Vector2 yz;};
 } Vector3;
-inline Vector3 v3(float x, float y, float z) { return (Vector3){x, y, z}; }
+inline Vector3 v3(float32 x, float32 y, float32 z) { return (Vector3){x, y, z}; }
 #define v3_expand(v) (v).x, (v).y, (v).z

 typedef union Vector4 {
-	struct {float x, y, z, w;};
-	struct {float x1, y1, x2, y2;};
-	struct {float r, g, b, a;};
-	struct {float left, bottom, right, top;};
+	struct {float32 x, y, z, w;};
+	struct {float32 x1, y1, x2, y2;};
+	struct {float32 r, g, b, a;};
+	struct {float32 left, bottom, right, top;};
 	struct {Vector2 xy; Vector2 zw;};
 	struct {Vector3 xyz;};
-	struct {float _x; Vector3 yzw;};
+	struct {float32 _x; Vector3 yzw;};
 } Vector4;
-inline Vector4 v4(float x, float y, float z, float w) { return (Vector4){x, y, z, w}; }
+inline Vector4 v4(float32 x, float32 y, float32 z, float32 w) { return (Vector4){x, y, z, w}; }
 #define v4_expand(v) (v).x, (v).y, (v).z, (v).w

-// #Simd #Speed
-
 inline Vector2 v2_add(Vector2 a, Vector2 b) {
-	return v2(a.x + b.x, a.y + b.y);
-}
-inline Vector3 v3_add(Vector3 a, Vector3 b) {
-	return v3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-inline Vector4 v4_add(Vector4 a, Vector4 b) {
-	return v4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+	// Component-wise a + b: simd_add_float32_64 reads both operands and
+	// writes the sum back into the by-value copy of a.
+	f32 *lanes = (f32*)&a;
+	simd_add_float32_64(lanes, (f32*)&b, lanes);
+	return a;
 }
 inline Vector2 v2_sub(Vector2 a, Vector2 b) {
-	return v2(a.x - b.x, a.y - b.y);
-}
-inline Vector3 v3_sub(Vector3 a, Vector3 b) {
-	return v3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-inline Vector4 v4_sub(Vector4 a, Vector4 b) {
-	return v4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+	// Component-wise a - b: simd_sub_float32_64 reads both operands and
+	// writes the difference back into the by-value copy of a.
+	f32 *lanes = (f32*)&a;
+	simd_sub_float32_64(lanes, (f32*)&b, lanes);
+	return a;
 }
 inline Vector2 v2_mul(Vector2 a, Vector2 b) {
-	return v2(a.x * b.x, a.y * b.y);
+	// Component-wise a * b: simd_mul_float32_64 reads both operands and
+	// writes the product back into the by-value copy of a.
+	f32 *lanes = (f32*)&a;
+	simd_mul_float32_64(lanes, (f32*)&b, lanes);
+	return a;
 }
-inline Vector2 v2_mulf(Vector2 a, float b) {
-	return v2(a.x * b, a.y * b);
-}
-inline Vector3 v3_mul(Vector3 a, Vector3 b) {
-	return v3(a.x * b.x, a.y * b.y, a.z * b.z);
-}
-inline Vector4 v4_mul(Vector4 a, Vector4 b) {
-	return v4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+inline Vector2 v2_mulf(Vector2 a, float32 s) {
+	// Broadcast s to both components and defer to v2_mul.
+	Vector2 factor = v2(s, s);
+	return v2_mul(a, factor);
 }
 inline Vector2 v2_div(Vector2 a, Vector2 b) {
-	return v2(a.x / b.x, a.y / b.y);
+	// Component-wise a / b: simd_div_float32_64 reads both operands and
+	// writes the quotient back into the by-value copy of a.
+	f32 *lanes = (f32*)&a;
+	simd_div_float32_64(lanes, (f32*)&b, lanes);
+	return a;
+}
+inline Vector2 v2_divf(Vector2 a, float32 s) {
+	// Broadcast s to both components and defer to v2_div.
+	Vector2 divisor = v2(s, s);
+	return v2_div(a, divisor);
+}
+
+inline Vector3 v3_add(Vector3 a, Vector3 b) {
+	// x and y are summed through simd_add_float32_64 (in place on the local
+	// copy of a); z is added separately as a scalar.
+	f32 *lanes = (f32*)&a;
+	simd_add_float32_64(lanes, (f32*)&b, lanes);
+	a.z = a.z + b.z;
+	return a;
+}
+inline Vector3 v3_sub(Vector3 a, Vector3 b) {
+	// x and y are subtracted through simd_sub_float32_64 (in place on the
+	// local copy of a); z is subtracted separately as a scalar.
+	f32 *lanes = (f32*)&a;
+	simd_sub_float32_64(lanes, (f32*)&b, lanes);
+	a.z = a.z - b.z;
+	return a;
+}
+inline Vector3 v3_mul(Vector3 a, Vector3 b) {
+	// Component-wise product a * b, returned by value.
+	// x and y go through simd_mul_float32_64 (in place on the local copy of
+	// a); z is multiplied separately as a scalar. The x/y step must be the
+	// *mul* variant: using simd_sub_float32_64 here would subtract x and y
+	// while still multiplying z.
+	simd_mul_float32_64((f32*)&a, (f32*)&b, (f32*)&a);
+	a.z *= b.z;
+	return a;
+}
+inline Vector3 v3_mulf(Vector3 a, float32 s) {
+	// Broadcast s to all three components and defer to v3_mul.
+	Vector3 factor = v3(s, s, s);
+	return v3_mul(a, factor);
 }
 inline Vector3 v3_div(Vector3 a, Vector3 b) {
-	return v3(a.x / b.x, a.y / b.y, a.z / b.z);
+	// Component-wise quotient a / b, returned by value.
+	// x and y go through simd_div_float32_64 (in place on the local copy of
+	// a); z is divided separately as a scalar. The x/y step must be the
+	// *div* variant: using simd_sub_float32_64 here would subtract x and y
+	// while still dividing z.
+	simd_div_float32_64((f32*)&a, (f32*)&b, (f32*)&a);
+	a.z /= b.z;
+	return a;
+}
+inline Vector3 v3_divf(Vector3 a, float32 s) {
+	// Broadcast s to all three components and defer to v3_div.
+	Vector3 divisor = v3(s, s, s);
+	return v3_div(a, divisor);
+}
+
+inline Vector4 v4_add(Vector4 a, Vector4 b) {
+	// Component-wise a + b: simd_add_float32_128 reads both operands and
+	// writes the sum back into the by-value copy of a.
+	f32 *lanes = (f32*)&a;
+	simd_add_float32_128(lanes, (f32*)&b, lanes);
+	return a;
+}
+inline Vector4 v4_sub(Vector4 a, Vector4 b) {
+	// Component-wise a - b: simd_sub_float32_128 reads both operands and
+	// writes the difference back into the by-value copy of a.
+	f32 *lanes = (f32*)&a;
+	simd_sub_float32_128(lanes, (f32*)&b, lanes);
+	return a;
+}
+inline Vector4 v4_mul(Vector4 a, Vector4 b) {
+	// Component-wise a * b: simd_mul_float32_128 reads both operands and
+	// writes the product back into the by-value copy of a.
+	f32 *lanes = (f32*)&a;
+	simd_mul_float32_128(lanes, (f32*)&b, lanes);
+	return a;
+}
+inline Vector4 v4_mulf(Vector4 a, float32 s) {
+	// Broadcast s to all four components and defer to v4_mul.
+	Vector4 factor = v4(s, s, s, s);
+	return v4_mul(a, factor);
 }
 inline Vector4 v4_div(Vector4 a, Vector4 b) {
-	return v4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+	// Component-wise a / b: simd_div_float32_128 reads both operands and
+	// writes the quotient back into the by-value copy of a.
+	f32 *lanes = (f32*)&a;
+	simd_div_float32_128(lanes, (f32*)&b, lanes);
+	return a;
 }
+inline Vector4 v4_divf(Vector4 a, float32 s) {
+	// Broadcast s to all four components and defer to v4_div.
+	Vector4 divisor = v4(s, s, s, s);
+	return v4_div(a, divisor);
+}
+
 inline Vector2 v2_normalize(Vector2 a) {
-    float length = sqrt(a.x * a.x + a.y * a.y);
-    if (length == 0) {
-        return (Vector2){0, 0};
-    }
-    return (Vector2){a.x / length, a.y / length};
+    // Divide a by its Euclidean length. A zero-length input returns the
+    // zero vector instead of dividing by zero.
+    float32 magnitude = sqrt(a.x * a.x + a.y * a.y);
+    if (magnitude != 0) {
+        return v2_divf(a, magnitude);
+    }
+    return (Vector2){0, 0};
 }


-Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float rotation_radians) {
-    float s = sin(rotation_radians);
-    float c = cos(rotation_radians);
+Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) {
+    // Rotates point about pivot by rotation_radians and returns the result.
+    float32 s = sin(rotation_radians);
+    float32 c = cos(rotation_radians);
 
-    point.x -= pivot.x;
-    point.y -= pivot.y;
+    // Move into pivot-relative space. This must happen exactly once: keeping
+    // the scalar subtraction alongside v2_sub would offset the point by
+    // 2 * pivot before the rotation is applied.
+    point = v2_sub(point, pivot);
 
-    float x_new = point.x * c - point.y * s;
-    float y_new = point.x * s + point.y * c;
+    float32 x_new = point.x * c - point.y * s;
+    float32 y_new = point.x * s + point.y * c;
 
-    point.x = x_new + pivot.x;
-    point.y = y_new + pivot.y;
+    // Move back out of pivot-relative space.
+    point = v2_add(v2(x_new, y_new), pivot);
 
    return point;
 }
@ -112,10 +146,10 @@ Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float rotatio


 typedef struct Matrix4 {
-    union {float m[4][4]; float data[16]; };
+    union {float32 m[4][4]; float32 data[16]; };
 } Matrix4;

-Matrix4 m4_scalar(float scalar) {
+Matrix4 m4_scalar(float32 scalar) {
    Matrix4 m;
    for (int i = 0; i < 16; i++) {
        m.data[i] = 0.0f;
@ -136,11 +170,11 @@ Matrix4 m4_make_translation(Vector3 translation) {
    return m;
 }

-Matrix4 m4_make_rotation(Vector3 axis, float radians) {
+Matrix4 m4_make_rotation(Vector3 axis, float32 radians) {
    Matrix4 m = m4_scalar(1.0);
-    float c = cosf(radians);
-    float s = sinf(radians);
-    float t = 1.0f - c;
+    float32 c = cosf(radians);
+    float32 s = sinf(radians);
+    float32 t = 1.0f - c;

    m.m[0][0] = c + axis.x * axis.x * t;
    m.m[0][1] = axis.x * axis.y * t + axis.z * s;
@ -158,7 +192,7 @@ Matrix4 m4_make_rotation(Vector3 axis, float radians) {
    return m;
 }

-inline Matrix4 m4_make_rotation_z(float radians) {
+inline Matrix4 m4_make_rotation_z(float32 radians) {
 	return m4_make_rotation(v3(0, 0, 1), radians);
 }

@ -189,11 +223,11 @@ inline Matrix4 m4_translate(Matrix4 m, Vector3 translation) {
    return m4_multiply(m, translation_matrix);
 }

-inline Matrix4 m4_rotate(Matrix4 m, Vector3 axis, float radians) {
+inline Matrix4 m4_rotate(Matrix4 m, Vector3 axis, float32 radians) {
    Matrix4 rotation_matrix = m4_make_rotation(axis, radians);
    return m4_multiply(m, rotation_matrix);
 }
-inline Matrix4 m4_rotate_z(Matrix4 m, float radians) {
+inline Matrix4 m4_rotate_z(Matrix4 m, float32 radians) {
    Matrix4 rotation_matrix = m4_make_rotation(v3(0, 0, 1), radians);
    return m4_multiply(m, rotation_matrix);
 }
@ -205,7 +239,7 @@ inline Matrix4 m4_scale(Matrix4 m, Vector3 scale) {


 // _near & _far because microsoft...
-Matrix4 m4_make_orthographic_projection(float left, float right, float bottom, float top, float _near, float _far) {
+Matrix4 m4_make_orthographic_projection(float32 left, float32 right, float32 bottom, float32 top, float32 _near, float32 _far) {
    Matrix4 m = m4_scalar(1.0f);
    m.m[0][0] = 2.0f / (right - left);
    m.m[1][1] = 2.0f / (top - bottom);
@ -227,7 +261,7 @@ Vector4 m4_transform(Matrix4 m, Vector4 v) {
 }
 Matrix4 m4_inverse(Matrix4 m) {
    Matrix4 inv;
-    float det;
+    float32 det;

    inv.m[0][0] = m.m[1][1] * m.m[2][2] * m.m[3][3] -
                  m.m[1][1] * m.m[2][3] * m.m[3][2] -
--- a/oogabooga/oogabooga.c
+++ b/oogabooga/oogabooga.c
@ -22,6 +22,11 @@
 	#define DO_ZERO_INITIALIZATION 1
 #endif

+#ifndef ENABLE_SIMD
+	#define ENABLE_SIMD 1
+#endif
+
+
 #define WINDOWS 0
 #define LINUX   1
 #define MACOS   2
@ -68,7 +73,7 @@ void lodepng_free(void* ptr) {

 /////

-
+#include "cpu.c"

 #ifdef _WIN32
 	#include <Windows.h>
@ -211,10 +216,21 @@ void _profiler_report_time_cycles(string name, u64 count, u64 start) {
 void oogabooga_init(u64 program_memory_size) { // Engine start-up: runs each initializer below once, in this order; program_memory_size is forwarded to os_init.
 	context.logger = default_logger;
 	temp = get_initialization_allocator();
+	init_cpu_specific(); // NOTE(review): per the commit notes this picks the SIMD function pointers - confirm in cpu.c.
+	Cpu_Capabilities features = query_cpu_capabilities(); // Captured here; only read by the log_verbose calls at the end.
 	os_init(program_memory_size);
 	heap_init();
 	temporary_storage_init();
 	gfx_init();
+	log_verbose("CPU has sse1: %cs", features.sse1 ? "true" : "false"); // One line per capability flag; "%cs" is presumably this project's C-string specifier - TODO confirm.
+	log_verbose("CPU has sse2: %cs", features.sse2 ? "true" : "false");
+	log_verbose("CPU has sse3: %cs", features.sse3 ? "true" : "false");
+	log_verbose("CPU has ssse3: %cs", features.ssse3 ? "true" : "false");
+	log_verbose("CPU has sse41: %cs", features.sse41 ? "true" : "false");
+	log_verbose("CPU has sse42: %cs", features.sse42 ? "true" : "false");
+	log_verbose("CPU has avx: %cs", features.avx ? "true" : "false");
+	log_verbose("CPU has avx2: %cs", features.avx2 ? "true" : "false");
+	log_verbose("CPU has avx512: %cs", features.avx512 ? "true" : "false");
 }

 #ifndef INITIAL_PROGRAM_MEMORY_SIZE
@ -229,7 +245,6 @@ int ENTRY_PROC(int argc, char **argv);

 int main(int argc, char **argv) {

-
 	printf("Ooga booga program started\n");
 	oogabooga_init(INITIAL_PROGRAM_MEMORY_SIZE); 
 	
--- a/oogabooga/tests.c
+++ b/oogabooga/tests.c
@ -539,8 +539,202 @@ void test_file_io() {
    delete_ok = os_file_delete("integers");
    assert(delete_ok, "Failed: could not delete integers"); 
 }
+void test_simd() {
 
+	// Checks every simd_* arithmetic entry point against a plain scalar
+	// reference, then prints how many cycles the whole run took.
+	u64 start = os_get_current_cycle_count();
+
+    // Setup test data. Only the first 16 elements are ever used (the widest
+    // operation is 512 bits = 16 x 32-bit lanes). The result buffers are
+    // cleared so a lane that an operation fails to write is never read
+    // uninitialised.
+    float32 a_f32[32], b_f32[32], result_f32[32];
+    s32 a_i32[16], b_i32[16], result_i32[16];
+
+    for (int i = 0; i < 16; ++i) {
+        a_f32[i] = i * 1.0f;
+        b_f32[i] = (i + 1) * 2.0f;
+        result_f32[i] = 0.0f;
+        a_i32[i] = i;
+        b_i32[i] = i + 1;
+        result_i32[i] = 0;
+    }
+
+    // NOTE(review): the return value is discarded here; the SIMD function
+    // pointers are presumably assigned by init_cpu_specific() during
+    // oogabooga_init - confirm this call is still needed.
+    query_cpu_capabilities();
+
+    // Test float32 add. The 64-bit variants cover two float32 lanes, so both
+    // are checked (v2_* and v3_* depend on lane 1 as much as on lane 0).
+    simd_add_float32_64(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 2; ++i) {
+        assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 64 failed");
+    }
+
+    simd_add_float32_128(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 4; ++i) {
+        assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 128 failed");
+    }
+
+    simd_add_float32_256(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 8; ++i) {
+        assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 256 failed");
+    }
+
+    simd_add_float32_512(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 512 failed");
+    }
+
+    // Test float32 subtract
+    simd_sub_float32_64(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 2; ++i) {
+        assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 64 failed");
+    }
+
+    simd_sub_float32_128(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 4; ++i) {
+        assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 128 failed");
+    }
+
+    simd_sub_float32_256(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 8; ++i) {
+        assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 256 failed");
+    }
+
+    simd_sub_float32_512(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 512 failed");
+    }
+
+    // Test float32 multiply
+    simd_mul_float32_64(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 2; ++i) {
+        assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 64 failed");
+    }
+
+    simd_mul_float32_128(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 4; ++i) {
+        assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 128 failed");
+    }
+
+    simd_mul_float32_256(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 8; ++i) {
+        assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 256 failed");
+    }
+
+    simd_mul_float32_512(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 512 failed");
+    }
+
+    // Test float32 divide
+    simd_div_float32_64(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 2; ++i) {
+        assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 64 failed");
+    }
+
+    simd_div_float32_128(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 4; ++i) {
+        assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 128 failed");
+    }
+
+    simd_div_float32_256(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 8; ++i) {
+        assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 256 failed");
+    }
+
+    simd_div_float32_512(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 512 failed");
+    }
+
+    // Test int32 add (inputs here are small, so the scalar reference cannot overflow)
+    simd_add_int32_128(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 4; ++i) {
+        assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 128 failed");
+    }
+
+    simd_add_int32_256(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 8; ++i) {
+        assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 256 failed");
+    }
+
+    simd_add_int32_512(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 512 failed");
+    }
+
+    // Test int32 subtract
+    simd_sub_int32_128(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 4; ++i) {
+        assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 128 failed");
+    }
+
+    simd_sub_int32_256(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 8; ++i) {
+        assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 256 failed");
+    }
+
+    simd_sub_int32_512(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 512 failed");
+    }
+
+    // Test int32 multiply
+    simd_mul_int32_128(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 4; ++i) {
+        assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 128 failed");
+    }
+
+    simd_mul_int32_256(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 8; ++i) {
+        assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 256 failed");
+    }
+
+    simd_mul_int32_512(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 512 failed");
+    }
+
+    // Stress test with random values
+    for (int i = 0; i < 16; ++i) {
+        a_f32[i] = (float32)get_random();
+        b_f32[i] = (float32)get_random();
+        a_i32[i] = (s32)get_random();
+        b_i32[i] = (s32)get_random();
+    }
+
+    simd_add_float32_512(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 stress test failed");
+    }
+
+    simd_sub_float32_512(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 stress test failed");
+    }
+
+    simd_mul_float32_512(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 stress test failed");
+    }
+
+    simd_div_float32_512(a_f32, b_f32, result_f32);
+    for (int i = 0; i < 16; ++i) {
+        assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 stress test failed");
+    }
+
+    // Random s32 inputs can overflow, and signed overflow is undefined in C.
+    // The scalar reference is therefore computed in u64 (which wraps) and
+    // narrowed back to s32, giving the two's-complement wraparound result.
+    simd_add_int32_512(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 16; ++i) {
+        s32 expected = (s32)((u64)a_i32[i] + (u64)b_i32[i]);
+        assert(result_i32[i] == expected, "SIMD add int32 stress test failed");
+    }
+
+    simd_sub_int32_512(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 16; ++i) {
+        s32 expected = (s32)((u64)a_i32[i] - (u64)b_i32[i]);
+        assert(result_i32[i] == expected, "SIMD sub int32 stress test failed");
+    }
+
+    simd_mul_int32_512(a_i32, b_i32, result_i32);
+    for (int i = 0; i < 16; ++i) {
+        s32 expected = (s32)((u64)a_i32[i] * (u64)b_i32[i]);
+        assert(result_i32[i] == expected, "SIMD mul int32 stress test failed");
+    }
+
+    u64 end = os_get_current_cycle_count();
+
+    u64 cycles = end-start;
+
+    print(" simd took %llu cycles ", cycles);
+}
 void oogabooga_run_tests() {
+	print("Testing simd... ");
+	test_simd();
+	print("OK!\n");
+	
 	print("Testing allocator... ");
 	test_allocator(true);
 	print("OK!\n");
@ -553,6 +747,7 @@ void oogabooga_run_tests() {
 	test_strings();
 	print("OK!\n");

+	
 	print("Thread bombing allocator... ");
 	Thread* threads[100];
 	for (int i = 0; i < 100; i++) {
@ -567,4 +762,5 @@ void oogabooga_run_tests() {
 	print("Testing file IO... ");
 	test_file_io();
 	print("OK!\n");
+	
 }