- Replace lodepng with stb_image (& add stb_truetype for fonts)

- Fix bug where the d3d11 input layout (input assembler) was not created correctly - Fix framerate being locked by swap chain Present - Move enable_vsync to window - sqrt & rsqrt simd - Add release build & run to VS Code tasks & launch - Cleanup
2024-07-04 20:56:27 +02:00 · 2024-07-04 20:56:27 +02:00 · 05919248eb
commit 05919248eb
parent 4c5f882999
26 changed files with 13557 additions and 305 deletions
--- a/.gitignore
+++ b/.gitignore
@ -54,4 +54,6 @@ test_doc.vkn
 *keybinds
 *.rdi

-google_trace.json
+google_trace.json
+
+build/*
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -2,7 +2,7 @@
    "version": "0.2.0",
    "configurations": [
        {
-            "name": "Launch with MSVC Debugger",
+            "name": "Launch Debug with MSVC Debugger",
            "type": "cppvsdbg",
            "request": "launch",
            "program": "${workspaceFolder}/build/cgame.exe", // Run the output executable after compile
@ -11,7 +11,19 @@
            "cwd": "${workspaceFolder}",
            "environment": [],
            "console":"integratedTerminal",
-            // "preLaunchTask": "Compile"
+            "preLaunchTask": "Compile"
+        },
+        {
+            "name": "Launch Release with MSVC Debugger",
+            "type": "cppvsdbg",
+            "request": "launch",
+            "program": "${workspaceFolder}/build/release/cgame.exe", // Run the output executable after compile
+            "args": [],
+            "stopAtEntry": false,
+            "cwd": "${workspaceFolder}",
+            "environment": [],
+            "console":"integratedTerminal",
+            "preLaunchTask": "Compile Release"
        }
    ]
 }
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@ -16,6 +16,21 @@
                // "close": false,
                // "showReuseMessage": true,
            }
-	}
+	    },
+        {
+            "label": "Compile Release",
+            "type": "shell",
+            "command": "${workspaceFolder}\\build_release",
+            "group": {
+                "kind": "build"
+            },
+            "problemMatcher": ["$gcc"],
+            "presentation": {
+                "clear": true,
+                // "revealProblems": "onProblem",
+                // "close": false,
+                // "showReuseMessage": true,
+            }
+	    }
    ]
 }
--- a/build.bat
+++ b/build.bat
@ -6,6 +6,6 @@ mkdir build

 pushd build

-clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration  -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32  -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -msse4.1
+clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations -lkernel32 -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi 

 popd
--- a/build.c
+++ b/build.c
@ -3,7 +3,7 @@
 ///
 // Build config stuff

-#define RUN_TESTS 1
+#define RUN_TESTS 0

 // This is only for people developing oogabooga!
 #define OOGABOOGA_DEV 1
@ -13,6 +13,7 @@
 // ENABLE_SIMD Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which doesn't
 #define ENABLE_SIMD 1

+
 #define INITIAL_PROGRAM_MEMORY_SIZE MB(5)

 typedef struct Context_Extra {
@ -21,8 +22,6 @@ typedef struct Context_Extra {
 // This needs to be defined before oogabooga if we want extra stuff in context
 #define CONTEXT_EXTRA Context_Extra

-#define GFX_RENDERER GFX_RENDERER_D3D11
-
 // This defaults to "entry", but we can set it to anything (except "main" or other existing proc names"
 #define ENTRY_PROC entry

@ -38,13 +37,13 @@ typedef struct Context_Extra {
 //

 // this is a minimal starting point for new projects. Copy & rename to get started
-#include "oogabooga/examples/minimal_game_loop.c"
+//  #include "oogabooga/examples/minimal_game_loop.c"

 // An engine dev stress test for rendering
 // #include "oogabooga/examples/renderer_stress_test.c"

 // Randy's example game that he's building out as a tutorial for using the engine
-// #include "entry_randygame.c"
+#include "entry_randygame.c"

 // This is where you swap in your own project!
 // #include "entry_yourepicgamename.c"
--- a/build_dissassembly.bat
+++ b/build_dissassembly.bat
@ -1,14 +1,18 @@

@echo off
-rmdir /S /Q build
-mkdir build
+if exist build/dissassembly (
+  rmdir /s /q build
+)
+if not exist build  (
+	mkdir build
+)

 pushd build

-mkdir release
-pushd release
+mkdir dissassembly
+pushd dissassembly

-clang -o cgame.asm ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration  -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -ffast-math -funroll-loops -finline-functions -fvectorize -fslp-vectorize -fomit-frame-pointer -fno-exceptions -fno-rtti -S -masm=intel
+clang -o cgame.asm ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration  -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations  -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize  -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -S -masm=intel

 popd
 popd
--- a/build_release.bat
+++ b/build_release.bat
@ -1,5 +1,7 @@
@echo off
-rmdir /S /Q build
+if exist build (
+  rmdir /s /q build
+)
 mkdir build

 pushd build
@ -7,7 +9,7 @@ pushd build
 mkdir release
 pushd release

-clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration  -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32  -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi  -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize  -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -msse4.1
+clang -o cgame.exe ../../build.c -Ofast -DNDEBUG -std=c11 -Wextra -Wno-incompatible-library-redeclaration  -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations -lgdi32 -luser32  -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi  -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize  -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions

 popd
 popd
--- a/oogabooga/base.c
+++ b/oogabooga/base.c
@ -10,37 +10,22 @@
 #define local_persist static

 #define forward_global extern
-
-// Haters gonna hate
-#define If if (
-#define then )
-// If cond then {}
-
-#ifdef _MSC_VER
-	inline void os_break() {
-		__debugbreak();
-		volatile int *a = 0;
-		*a = 5;
-	}
-#else
-	#error "Only msvc compiler supported at the moment";
-#endif
 	
 	
 void printf(const char* fmt, ...);
 #define ASSERT_STR_HELPER(x) #x
 #define ASSERT_STR(x) ASSERT_STR_HELPER(x)
-#define assert_line(line, cond, ...) if(!(cond)) { printf("Assertion failed in file " __FILE__ " on line " ASSERT_STR(line) "\nFailed Condition: " #cond ". Message: " __VA_ARGS__); os_break(); }
-#define assert(cond, ...) assert_line(__LINE__, cond, __VA_ARGS__);
+#define assert_line(line, cond, ...) {if(!(cond)) { printf("Assertion failed in file " __FILE__ " on line " ASSERT_STR(line) "\nFailed Condition: " #cond ". Message: " __VA_ARGS__); crash(); }}
+#define assert(cond, ...) {assert_line(__LINE__, cond, __VA_ARGS__)}

 #define DEFER(start, end) for(int _i_ = ((start), 0); _i_ == 0; _i_ += 1, (end))

 #if CONFIGURATION == RELEASE
 #undef assert
-#define assert(...)
+#define assert(...) (void)0;
 #endif

-#define panic(...) { print(__VA_ARGS__); os_break(); }
+#define panic(...) { print(__VA_ARGS__); crash(); }

 #define cast(t) (t)

@ -48,7 +33,6 @@ void printf(const char* fmt, ...);



-
 #define FIRST_ARG(arg1, ...) arg1
 #define SECOND_ARG(arg1, arg2, ...) arg2
 #define print(...) _Generic((FIRST_ARG(__VA_ARGS__)), \
--- a/oogabooga/cpu.c
+++ b/oogabooga/cpu.c
@ -29,6 +29,11 @@ typedef struct Cpu_Capabilities {
 	#define inline __forceinline
 	#define alignat(x) __declspec(align(x))
    #define COMPILER_HAS_MEMCPY_INTRINSICS 1
+    inline void crash() {
+		__debugbreak();
+		volatile int *a = 0;
+		*a = 5;
+	}
    #include <intrin.h>
    #pragma intrinsic(__rdtsc)
    inline u64 rdtsc() {
@ -66,6 +71,11 @@ typedef struct Cpu_Capabilities {
 	#define inline __attribute__((always_inline)) inline
 	#define alignat(x) __attribute__((aligned(x)))
    #define COMPILER_HAS_MEMCPY_INTRINSICS 1
+    inline void crash() {
+		__builtin_trap();
+		volatile int *a = 0;
+		*a = 5;
+	}
    inline u64 rdtsc() {
        unsigned int lo, hi;
        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
@ -119,7 +129,6 @@ typedef struct Cpu_Capabilities {
    #warning "Compiler is not explicitly supported, some things will probably not work as expected"
 #endif

-
 Cpu_Capabilities query_cpu_capabilities() {
    Cpu_Capabilities result = {0};

--- a/oogabooga/d3d11_image_shader_bytecode.c
+++ b/oogabooga/d3d11_image_shader_bytecode.c
@ -3,10 +3,10 @@

 struct VS_INPUT
 {
+    float4 position : POSITION;
    float2 uv : TEXCOORD;
    float4 color : COLOR;
    int texture_index: TEXTURE_INDEX;
-    float4 position : POSITION;
 };

 struct PS_INPUT
@ -79,8 +79,8 @@ float4 ps_main(PS_INPUT input) : SV_TARGET
 */

 const u8 IMAGE_SHADER_VERTEX_BLOB_BYTES[]= {
-0x44, 0x58, 0x42, 0x43, 0xdd, 0x02, 0x55, 0xb0, 0x7b, 0x83, 0x6c, 0x34, 0x45, 0xe8, 0x51, 0xd4, 
-0x76, 0xbf, 0x66, 0x77, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x03, 0x00, 0x00, 0x05, 0x00, 0x00, 
+0x44, 0x58, 0x42, 0x43, 0xf4, 0xea, 0x50, 0x9f, 0xcf, 0xeb, 0x01, 0x7b, 0x78, 0x58, 0xd5, 0x6b, 
+0x4f, 0x9f, 0xc1, 0xe2, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x03, 0x00, 0x00, 0x05, 0x00, 0x00, 
 0x00, 0x34, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x38, 0x01, 0x00, 0x00, 0xd4, 0x01, 
 0x00, 0x00, 0xa0, 0x02, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00, 0x00, 
 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 
@ -92,14 +92,14 @@ const u8 IMAGE_SHADER_VERTEX_BLOB_BYTES[]= {
 0x6c, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2e, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4e, 0x90, 0x00, 
 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x00, 
 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-0x03, 0x03, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00, 0x00, 0x77, 0x00, 
-0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 
-0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
-0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00, 
-0x00, 0x54, 0x45, 0x58, 0x43, 0x4f, 0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52, 
-0x00, 0x54, 0x45, 0x58, 0x54, 0x55, 0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00, 
-0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x00, 0xab, 0xab, 0x4f, 0x53, 0x47, 0x4e, 
+0x0f, 0x0f, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
+0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x7a, 0x00, 
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 
+0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
+0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 
+0x00, 0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x00, 0x54, 0x45, 0x58, 0x43, 0x4f, 
+0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52, 0x00, 0x54, 0x45, 0x58, 0x54, 0x55, 
+0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00, 0xab, 0xab, 0x4f, 0x53, 0x47, 0x4e, 
 0x94, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 
 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 
 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
@ -111,19 +111,19 @@ const u8 IMAGE_SHADER_VERTEX_BLOB_BYTES[]= {
 0x54, 0x45, 0x58, 0x43, 0x4f, 0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52, 0x00, 
 0x54, 0x45, 0x58, 0x54, 0x55, 0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00, 0xab, 
 0xab, 0xab, 0x53, 0x48, 0x45, 0x58, 0xc4, 0x00, 0x00, 0x00, 0x50, 0x00, 0x01, 0x00, 0x31, 
-0x00, 0x00, 0x00, 0x6a, 0x08, 0x00, 0x01, 0x5f, 0x00, 0x00, 0x03, 0x32, 0x10, 0x10, 0x00, 
-0x00, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00, 
-0x00, 0x5f, 0x00, 0x00, 0x03, 0x12, 0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x5f, 0x00, 
-0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x04, 0xf2, 
+0x00, 0x00, 0x00, 0x6a, 0x08, 0x00, 0x01, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 
+0x00, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x03, 0x32, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00, 
+0x00, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x5f, 0x00, 
+0x00, 0x03, 0x12, 0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x04, 0xf2, 
 0x20, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03, 
 0x32, 0x20, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03, 0xf2, 0x20, 0x10, 
 0x00, 0x02, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03, 0x12, 0x20, 0x10, 0x00, 0x03, 0x00, 
 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0xf2, 0x20, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46, 
-0x1e, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x32, 0x20, 0x10, 0x00, 
-0x01, 0x00, 0x00, 0x00, 0x46, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 
-0x05, 0xf2, 0x20, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x1e, 0x10, 0x00, 0x01, 0x00, 
+0x1e, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x32, 0x20, 0x10, 0x00, 
+0x01, 0x00, 0x00, 0x00, 0x46, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 
+0x05, 0xf2, 0x20, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x1e, 0x10, 0x00, 0x02, 0x00, 
 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x12, 0x20, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0a, 
-0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, 
+0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54, 
 0x94, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
--- a/oogabooga/dev/d3d11_image_shader.hlsl
+++ b/oogabooga/dev/d3d11_image_shader.hlsl
@ -1,9 +1,9 @@
 struct VS_INPUT
 {
+    float4 position : POSITION;
    float2 uv : TEXCOORD;
    float4 color : COLOR;
    int texture_index: TEXTURE_INDEX;
-    float4 position : POSITION;
 };

 struct PS_INPUT
--- a/oogabooga/drawing.c
+++ b/oogabooga/drawing.c
@ -197,65 +197,39 @@ Draw_Quad *draw_image_xform(Gfx_Image *image, Matrix4 xform, Vector2 size, Vecto
 #define COLOR_BLACK ((Vector4){0.0, 0.0, 0.0, 1.0})

 Gfx_Image *load_image_from_disk(string path, Allocator allocator) {
-	string png;
-	bool ok = os_read_entire_file(path, &png, allocator);
-	if (!ok) return 0;
+    string png;
+    bool ok = os_read_entire_file(path, &png, allocator);
+    if (!ok) return 0;

-	Gfx_Image *image = alloc(allocator, sizeof(Gfx_Image));
-	
-	// This is fucking terrible I gotta write my own decoder
-
-	lodepng_allocator = allocator;
-
-	LodePNGState state;
-	lodepng_state_init(&state);
-	u32 error = lodepng_inspect(&image->width, &image->height, &state, png.data, png.count);
-	if (error) {
-		return 0;
-	}
-	
-	// 5 lines of code to say "ignore_adler32 = true" (because it's broken and gives me an error)
-	// I JUST WANT TO LOAD A PNG 
-	LodePNGDecoderSettings decoder;
-	lodepng_decoder_settings_init(&decoder);
-	lodepng_decompress_settings_init(&decoder.zlibsettings);
-	decoder.zlibsettings.ignore_adler32 = true;
-	state.decoder = decoder;
-	
-	error = lodepng_decode(&image->data, &image->width, &image->height, &state, png.data, png.count);
-	
-	lodepng_state_cleanup(&state);
-	
-	dealloc_string(allocator, png);
-	
-	if (error) {
-		return 0;
-	}
-	
-	// We need to flip the image
-	u32 row_bytes = image->width * 4;  // #Magicvalue assuming 4 bytes
-    u8* temp_row = (u8*)alloc(temp, row_bytes);
-    for (u32 i = 0; i < image->height / 2; i++) {
-        u8* top_row = image->data + i * row_bytes;
-        u8* bottom_row = image->data + (image->height - i - 1) * row_bytes;
-
-        // Swap the top row with the bottom row
-        memcpy(temp_row, top_row, row_bytes);
-        memcpy(top_row, bottom_row, row_bytes);
-        memcpy(bottom_row, temp_row, row_bytes);
+    Gfx_Image *image = alloc(allocator, sizeof(Gfx_Image));
+    
+    // Use stb_image to load and decode the PNG
+    int width, height, channels;
+    stbi_set_flip_vertically_on_load(1);  // stb_image can flip the image on load
+    unsigned char* stb_data = stbi_load_from_memory(png.data, png.count, &width, &height, &channels, STBI_rgb_alpha);
+    
+    if (!stb_data) {
+        dealloc(allocator, image);
+        dealloc_string(allocator, png);
+        return 0;
    }
-	
-	image->gfx_handle = GFX_INVALID_HANDLE; // This is handled in gfx
-	
-	image->allocator = allocator;
-	
-	return image;
+    
+    image->data = stb_data;
+    image->width = width;
+    image->height = height;
+    image->gfx_handle = GFX_INVALID_HANDLE;  // This is handled in gfx
+    image->allocator = allocator;
+
+    dealloc_string(allocator, png);
+
+    return image;
 }
+
 void delete_image(Gfx_Image *image) {
-	dealloc(image->allocator, image->data);
-	image->width = 0;
-	image->height = 0;
-	draw_frame.garbage_stack[draw_frame.garbage_stack_count] = image->gfx_handle;
-	draw_frame.garbage_stack_count += 1;
-	dealloc(image->allocator, image);
+    stbi_image_free(image->data);  // Free the image data allocated by stb_image
+    image->width = 0;
+    image->height = 0;
+    draw_frame.garbage_stack[draw_frame.garbage_stack_count] = image->gfx_handle;
+    draw_frame.garbage_stack_count += 1;
+    dealloc(image->allocator, image);
 }
--- a/oogabooga/examples/renderer_stress_test.c
+++ b/oogabooga/examples/renderer_stress_test.c
@ -15,8 +15,6 @@ int entry(int argc, char **argv) {
 	Gfx_Image *hammer_image = load_image_from_disk(STR("oogabooga/examples/hammer.png"), get_heap_allocator());
 	assert(hammer_image, "Failed loading hammer.png");
 	
-	Gfx_Font *font = load_font_From_disk(
-	
 	seed_for_random = os_get_current_cycle_count();
 	
 	const float64 fps_limit = 69000;
@ -36,7 +34,9 @@ int entry(int argc, char **argv) {
 			delta = now - last_time;
 		}
 		last_time = now;
-		os_update(); 
+		tm_scope_cycles("os_update") {
+			os_update(); 
+		}
 		
 		if (is_key_just_released(KEY_ESCAPE)) {
 			window.should_close = true;
@ -102,11 +102,10 @@ int entry(int argc, char **argv) {
 		
 		draw_image(bush_image, v2(0.65, 0.65), v2(0.2*sin(now), 0.2*sin(now)), COLOR_WHITE);
 		
-		draw_frame.font = STR("");
+		tm_scope_cycles("gfx_update") {
+			gfx_update();
+		}
 		
-		draw_text();
-		
-		gfx_update();
 		
 		if (is_key_just_released('E')) {
 			log("FPS: %.2f", 1.0 / delta);
--- a/oogabooga/gfx_impl_d3d11.c
+++ b/oogabooga/gfx_impl_d3d11.c
@ -13,10 +13,10 @@ const Gfx_Handle GFX_INVALID_HANDLE = 0;

 string temp_win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16);

-typedef struct D3D11_Vertex {
+typedef  struct alignat(16) D3D11_Vertex {
+	Vector4 color;
 	Vector4 position;
 	Vector2 uv;
-	Vector4 color;
 	int texture_index;
 } D3D11_Vertex;

@ -81,14 +81,19 @@ void CALLBACK d3d11_debug_callback(D3D11_MESSAGE_CATEGORY category, D3D11_MESSAG
 		case D3D11_MESSAGE_SEVERITY_CORRUPTION:
 		case D3D11_MESSAGE_SEVERITY_ERROR:
 			log_error(msg);
+			break;
 		case D3D11_MESSAGE_SEVERITY_WARNING:
 			log_warning(msg);
+			break;
 		case D3D11_MESSAGE_SEVERITY_INFO:
 			log_info(msg);
+			break;
 		case D3D11_MESSAGE_SEVERITY_MESSAGE:
 			log_verbose(msg);
+			break;
 		default:
 			log("Ligma");
+			break;
 	}
 }

@ -127,7 +132,8 @@ void d3d11_update_swapchain() {
 	if (create) {
 		DXGI_SWAP_CHAIN_DESC1 scd = ZERO(DXGI_SWAP_CHAIN_DESC1);
 		scd.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
-		//scd.BufferDesc.RefreshRate.Numerator = xx st.refresh_rate;
+		scd.Flags = DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING;
+		//scd.BufferDesc.RefreshRate.Numerator = 0;
 		//scd.BufferDesc.RefreshRate.Denominator = 1;
 		
 		scd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
@ -137,23 +143,23 @@ void d3d11_update_swapchain() {
 			scd.Scaling = DXGI_SCALING_STRETCH; // for compatability with 7
 		}
 		
+		
 		// Windows 10 allows to use DXGI_SWAP_EFFECT_FLIP_DISCARD
 		// for Windows 8 compatibility use DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL
 		// for Windows 7 compatibility use DXGI_SWAP_EFFECT_DISCARD
 		if (d3d11_feature_level >= D3D_FEATURE_LEVEL_11_0) {
 			// this is supported only on FLIP presentation model
 			scd.Scaling = DXGI_SCALING_NONE;
-			scd.SwapEffect = DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL;
+			scd.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD; 
 			scd.BufferCount = 3;
-			gfx._can_vsync = false;
 			log_verbose("Present mode is flip discard, 3 buffers");
 		} else {
 			scd.SwapEffect = DXGI_SWAP_EFFECT_DISCARD;
 			scd.BufferCount = 2;
-			gfx._can_vsync = true;
 			log_verbose("Present mode is discard, 2 buffers");
 		}
 		
+		
 		// Obtain DXGI factory from device
 		IDXGIDevice *dxgi_device;
 		hr = VTABLE(QueryInterface, d3d11_device, &IID_IDXGIDevice, cast(void**)&dxgi_device);
@ -224,7 +230,7 @@ void d3d11_update_swapchain() {

 void gfx_init() {

-	gfx.enable_vsync = false;
+	window.enable_vsync = false;

 	log_verbose("d3d11 gfx_init");

@ -426,42 +432,53 @@ void gfx_init() {

    log_verbose("Shaders created");

+
+
+	D3D11_INPUT_ELEMENT_DESC layout[4];
+	memset(layout, 0, sizeof(layout));
+	
+	layout[0].SemanticName = "POSITION";
+	layout[0].SemanticIndex = 0;
+	layout[0].Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
+	layout[0].InputSlot = 0;
+	layout[0].AlignedByteOffset = offsetof(D3D11_Vertex, position);
+	layout[0].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
+	layout[0].InstanceDataStepRate = 0;
+	
+	layout[1].SemanticName = "TEXCOORD";
+	layout[1].SemanticIndex = 0;
+	layout[1].Format = DXGI_FORMAT_R32G32_FLOAT;
+	layout[1].InputSlot = 0;
+	layout[1].AlignedByteOffset = offsetof(D3D11_Vertex, uv);
+	layout[1].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
+	layout[1].InstanceDataStepRate = 0;
+	
+	layout[2].SemanticName = "COLOR";
+	layout[2].SemanticIndex = 0;
+	layout[2].Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
+	layout[2].InputSlot = 0;
+	layout[2].AlignedByteOffset = offsetof(D3D11_Vertex, color);
+	layout[2].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
+	layout[2].InstanceDataStepRate = 0;
+	
+	layout[3].SemanticName = "TEXTURE_INDEX";
+	layout[3].SemanticIndex = 0;
+	layout[3].Format = DXGI_FORMAT_R32_SINT;
+	layout[3].InputSlot = 0;
+	layout[3].AlignedByteOffset = offsetof(D3D11_Vertex, texture_index);
+	layout[3].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
+	layout[3].InstanceDataStepRate = 0;
+	
+	hr = VTABLE(CreateInputLayout, d3d11_device, layout, 4, vs_buffer, vs_size, &d3d11_image_vertex_layout);
+	win32_check_hr(hr);
+
 #if OOGABOOGA_DEV
 	D3D11Release(vs_blob);
    D3D11Release(ps_blob);
 #endif

-	D3D11_INPUT_ELEMENT_DESC layout[4];
-	memset(layout, 0, sizeof(layout));
-	
-	layout[0] = (D3D11_INPUT_ELEMENT_DESC){
-		"POSITION", 0,
-		DXGI_FORMAT_R32G32B32A32_FLOAT, 0,
-		offsetof(D3D11_Vertex, position),
-		D3D11_INPUT_PER_VERTEX_DATA, 0
-	};
-	layout[1] = (D3D11_INPUT_ELEMENT_DESC){
-		"TEXCOORD", 0,
-		DXGI_FORMAT_R32G32_FLOAT, 0,
-		offsetof(D3D11_Vertex, uv),
-		D3D11_INPUT_PER_VERTEX_DATA, 0
-	};
-	layout[2] = (D3D11_INPUT_ELEMENT_DESC){
-		"COLOR", 0,
-		DXGI_FORMAT_R32G32B32A32_FLOAT, 0,
-		offsetof(D3D11_Vertex, color),
-		D3D11_INPUT_PER_VERTEX_DATA, 0
-	};
-	layout[3] = (D3D11_INPUT_ELEMENT_DESC){
-		"TEXTURE_INDEX", 0,
-		DXGI_FORMAT_R32_SINT, 0,
-		offsetof(D3D11_Vertex, texture_index),
-		D3D11_INPUT_PER_VERTEX_DATA, 0
-	};
-	
-	hr = VTABLE(CreateInputLayout, d3d11_device, layout, 4, vs_buffer, vs_size, &d3d11_image_vertex_layout);
-
 	log_info("D3D11 init done");
+	
 }

 void d3d11_draw_call(int number_of_rendered_quads, ID3D11ShaderResourceView **textures, u64 num_textures) {
@ -493,7 +510,6 @@ void d3d11_draw_call(int number_of_rendered_quads, ID3D11ShaderResourceView **te
 }

 void gfx_update() {
-
 	if (window.should_close) return;
 	
 	VTABLE(ClearRenderTargetView, d3d11_context, d3d11_window_render_target_view, (float*)&window.clear_color);
@ -501,59 +517,61 @@ void gfx_update() {

 	HRESULT hr;

-	///
-	// purge garbage
-	for (u64 i = 0; i < draw_frame.garbage_stack_count; i++) {
-		ID3D11ShaderResourceView *view = draw_frame.garbage_stack[i];
-		ID3D11Resource *resource = 0;
-		VTABLE(GetResource, view, &resource);
-		
-		ID3D11Texture2D *texture = 0;
-		hr = VTABLE(QueryInterface, resource, &IID_ID3D11Texture2D, (void**)&texture);
-		if (SUCCEEDED(hr)) {
-			D3D11Release(view);
-			D3D11Release(texture);
-			log("Destroyed an image");
-		} else {
-			panic("Unhandled D3D11 resource deletion");
+	tm_scope_cycles("Frame setup") {
+		///
+		// purge garbage
+		for (u64 i = 0; i < draw_frame.garbage_stack_count; i++) {
+			ID3D11ShaderResourceView *view = draw_frame.garbage_stack[i];
+			ID3D11Resource *resource = 0;
+			VTABLE(GetResource, view, &resource);
+			
+			ID3D11Texture2D *texture = 0;
+			hr = VTABLE(QueryInterface, resource, &IID_ID3D11Texture2D, (void**)&texture);
+			if (SUCCEEDED(hr)) {
+				D3D11Release(view);
+				D3D11Release(texture);
+				log("Destroyed an image");
+			} else {
+				panic("Unhandled D3D11 resource deletion");
+			}
+		}
+	
+		///
+		// Maybe resize swap chain
+		RECT client_rect;
+		bool ok = GetClientRect(window._os_handle, &client_rect);
+		assert(ok, "GetClientRect failed with error code %lu", GetLastError());
+		u32 window_width  = client_rect.right-client_rect.left;
+		u32 window_height = client_rect.bottom-client_rect.top;
+		if (window_width != d3d11_swap_chain_width || window_height != d3d11_swap_chain_height) {
+			d3d11_update_swapchain();
+		}
+	
+		///
+		// Maybe grow quad vbo
+		u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6;
+	
+		if (required_size > d3d11_quad_vbo_size) {
+			if (d3d11_quad_vbo) {
+				D3D11Release(d3d11_quad_vbo);
+				dealloc(get_heap_allocator(), d3d11_staging_quad_buffer);
+			}
+			D3D11_BUFFER_DESC desc = ZERO(D3D11_BUFFER_DESC);
+			desc.Usage = D3D11_USAGE_DYNAMIC; 
+			desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+			desc.ByteWidth = required_size;
+			desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
+			HRESULT hr = VTABLE(CreateBuffer, d3d11_device, &desc, 0, &d3d11_quad_vbo);
+			assert(SUCCEEDED(hr), "CreateBuffer failed");
+			d3d11_quad_vbo_size = required_size;
+			
+			d3d11_staging_quad_buffer = alloc(get_heap_allocator(), d3d11_quad_vbo_size);
+			assert((u64)d3d11_staging_quad_buffer%16 == 0);
+			
+			log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size);
 		}
 	}

-	///
-	// Maybe resize swap chain
-	RECT client_rect;
-	bool ok = GetClientRect(window._os_handle, &client_rect);
-	assert(ok, "GetClientRect failed with error code %lu", GetLastError());
-	u32 window_width  = client_rect.right-client_rect.left;
-	u32 window_height = client_rect.bottom-client_rect.top;
-	if (window_width != d3d11_swap_chain_width || window_height != d3d11_swap_chain_height) {
-		d3d11_update_swapchain();
-	}
-
-	///
-	// Maybe grow quad vbo
-	u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6;
-
-	if (required_size > d3d11_quad_vbo_size) {
-		if (d3d11_quad_vbo) {
-			D3D11Release(d3d11_quad_vbo);
-			dealloc(get_heap_allocator(), d3d11_staging_quad_buffer);
-		}
-		D3D11_BUFFER_DESC desc = ZERO(D3D11_BUFFER_DESC);
-		desc.Usage = D3D11_USAGE_DYNAMIC; 
-		desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
-		desc.ByteWidth = required_size;
-		desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
-		HRESULT hr = VTABLE(CreateBuffer, d3d11_device, &desc, 0, &d3d11_quad_vbo);
-		assert(SUCCEEDED(hr), "CreateBuffer failed");
-		d3d11_quad_vbo_size = required_size;
-		
-		d3d11_staging_quad_buffer = alloc(get_heap_allocator(), d3d11_quad_vbo_size);
-		
-		log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size);
-	}
-
-	f64 rest_before  = os_get_current_time_in_seconds();
 	if (draw_frame.num_blocks > 0) {
 		///
 		// Render geometry from into vbo quad list
@ -569,8 +587,8 @@ void gfx_update() {
 		Draw_Quad_Block *block = &first_block;
 		
 		tm_scope_cycles("Quad processing") {
-			while (block != 0 && block->num_quads > 0) tm_scope_cycles("ad2As") {
-				for (u64 i = 0; i < block->num_quads; i++) tm_scope_cycles("Single quad") {
+			while (block != 0 && block->num_quads > 0) tm_scope_cycles("Quad block") {
+				for (u64 i = 0; i < block->num_quads; i++)  {
 					
 					Draw_Quad *q = &block->quad_buffer[i];
 					
@ -620,7 +638,7 @@ void gfx_update() {
 								if (num_textures >= 32) {
 									// If max textures reached, make a draw call and start over
 									D3D11_MAPPED_SUBRESOURCE buffer_mapping;
-									VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_NO_OVERWRITE, 0, &buffer_mapping);
+									VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping);
 									memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
 									VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
 									d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
@ -676,30 +694,29 @@ void gfx_update() {
 			}
 		}
 		
-	    D3D11_MAPPED_SUBRESOURCE buffer_mapping;
-		VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_NO_OVERWRITE, 0, &buffer_mapping);
-		memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
-		VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
+		tm_scope_cycles("Write to gpu") {
+		    D3D11_MAPPED_SUBRESOURCE buffer_mapping;
+			tm_scope_cycles("The Map call") {
+				hr = VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping);
+			win32_check_hr(hr);
+			}
+			tm_scope_cycles("The memcpy") {
+				memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
+			}
+			tm_scope_cycles("The Unmap call") {
+				VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
+			}
+		}
 		
 		///
 		// Draw call
-		
-		u64 before_draw = os_get_current_cycle_count();
-		d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
-		u64 after_draw = os_get_current_cycle_count();
-		//log("Draw call took %llu cycles", after_draw-before_draw);
+		tm_scope_cycles("Draw call") d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
    }
-    
-    f64 rest_after  = os_get_current_time_in_seconds();
-    if (is_key_just_pressed('E')) 
-    	log("The rest took %.2fms", (rest_after-rest_before)*1000.0);
-    
-    f64 before_present = os_get_current_time_in_seconds();
-    hr = VTABLE(Present, d3d11_swap_chain, gfx._can_vsync && gfx.enable_vsync, 0);
-    f64 after = os_get_current_time_in_seconds();
-    if (is_key_just_pressed('E')) 
-    	log("Present took %.2fms", (after-before_present)*1000.0);
-	win32_check_hr(hr);
+
+	tm_scope_cycles("Present") {
+	    hr = VTABLE(Present, d3d11_swap_chain, window.enable_vsync, window.enable_vsync ? 0 : DXGI_PRESENT_ALLOW_TEARING);
+		win32_check_hr(hr);
+	}    
 	
 #if CONFIGURATION == DEBUG
 	///
--- a/oogabooga/gfx_interface.c
+++ b/oogabooga/gfx_interface.c
@ -15,18 +15,6 @@
 	#error "Unknown renderer GFX_RENDERER defined"
 #endif

-
-typedef struct Gfx_State {
-
-	// config
-	bool enable_vsync;
-
-	// readonly
-	bool _can_vsync;
-	
-} Gfx_State;
-Gfx_State gfx;
-
 forward_global const Gfx_Handle GFX_INVALID_HANDLE;

 typedef struct Gfx_Image {
--- a/oogabooga/linmath.c
+++ b/oogabooga/linmath.c
@ -133,7 +133,7 @@ inline float v3_dot_product(Vector3 a, Vector3 b) {
 	return simd_dot_product_float32_96((float*)&a, (float*)&b);
 }
 inline float v4_dot_product(Vector4 a, Vector4 b) {
-	return simd_dot_product_float32_128((float*)&a, (float*)&b);
+	return simd_dot_product_float32_128_aligned((float*)&a, (float*)&b);
 }

 Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) {
--- a/oogabooga/memory.c
+++ b/oogabooga/memory.c
@ -24,7 +24,7 @@ void* initialization_allocator_proc(u64 size, void *p, Allocator_Message message
 			
 			if (init_memory_head >= ((u8*)init_memory_arena+INIT_MEMORY_SIZE)) {
 				os_write_string_to_stdout(STR("Out of initialization memory! Please provide more by increasing INIT_MEMORY_SIZE"));
-				os_break();
+				crash();
 			}
 			return p;
 			break;
--- a/oogabooga/oogabooga.c
+++ b/oogabooga/oogabooga.c
@ -145,23 +145,22 @@ typedef u8 bool;
    #warning "Compiler is not explicitly supported, some things will probably not work as expected"
 #endif

-#include "cpu.c"
-
-
 #define DEBUG 0
 #define VERY_DEBUG 1
 #define RELEASE 2

-#if !defined(CONFIGURATION)
-
-	#if defined(NDEBUG)
-		#define CONFIGURATION RELEASE
-	#else
-		#define CONFIGURATION DEBUG
-	#endif
-
+#if defined(NDEBUG)
+	#define CONFIGURATION RELEASE
+#else
+	#define CONFIGURATION DEBUG
 #endif

+
+#include "cpu.c"
+
+
+
+
 #ifndef ENTRY_PROC
 	#define ENTRY_PROC entry
 #endif
--- a/oogabooga/os_impl_windows.c
+++ b/oogabooga/os_impl_windows.c
@ -135,6 +135,8 @@ LRESULT CALLBACK win32_window_proc(HWND passed_window, UINT message, WPARAM wpar

 void os_init(u64 program_memory_size) {
 	
+	memset(&window, 0, sizeof(window));
+	
 	timeBeginPeriod(1);
 #if CONFIGURATION == RELEASE
 	SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
--- a/oogabooga/os_interface.c
+++ b/oogabooga/os_interface.c
@ -61,36 +61,6 @@ inline int crt_vprintf(const char* fmt, va_list args) {
 	return os.crt_vprintf(fmt, args);
 }

-#if !defined(COMPILER_HAS_MEMCPY_INTRINSICS) || CONFIGURATION == DEBUG
-	inline void* naive_memcpy(void* dest, const void* source, size_t size) {
-		for (u64 i = 0; i < (u64)size; i++) ((u8*)dest)[i] = ((u8*)source)[i];
-		return dest;
-	}
-	inline void* memcpy(void* dest, const void* source, size_t size) {
-		if (!os.crt_memcpy) return naive_memcpy(dest, source, size);
-		return os.crt_memcpy(dest, source, size);
-	}
-	inline int naive_memcmp(const void* a, const void* b, size_t amount) {
-		// I don't understand the return value of memcmp but I also dont care
-		for (u64 i = 0; i < (u64)amount; i++) {
-			if (((u8*)a)[i] != ((u8*)b)[i])  return -1;
-		}
-		return 0;
-	}
-	inline int memcmp(const void* a, const void* b, size_t amount) {
-		if (!os.crt_memcmp)  return naive_memcmp(a, b, amount);
-		return os.crt_memcmp(a, b, amount);
-	}
-	inline void* naive_memset(void* dest, int value, size_t amount) {
-		for (u64 i = 0; i < (u64)amount; i++) ((u8*)dest)[i] = (u8)value;
-		return dest;
-	}
-	inline void* memset(void* dest, int value, size_t amount) {
-		if (!os.crt_memset)  return naive_memset(dest, value, amount);
-		return os.crt_memset(dest, value, amount);
-	}
-#endif
-
 inline bool bytes_match(void *a, void *b, u64 count) { return memcmp(a, b, count) == 0; }

 inline int vsnprintf(char* buffer, size_t n, const char* fmt, va_list args) {
@ -333,6 +303,7 @@ typedef struct Os_Window {
 	u32 x;
 	u32 y;
 	Vector4 clear_color;
+	bool enable_vsync;
 	
 	bool should_close;
 	
--- a/oogabooga/simd.c
+++ b/oogabooga/simd.c
@ -30,6 +30,16 @@ inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result);
 inline float basic_dot_product_float32_64(float *a, float *b);
 inline float basic_dot_product_float32_96(float *a, float *b);
 inline float basic_dot_product_float32_128(float *a, float *b);
+inline void basic_sqrt_float32_64(float *a, float *result);
+inline void basic_sqrt_float32_96(float *a, float *result);
+inline void basic_sqrt_float32_128(float *a, float *result);
+inline void basic_sqrt_float32_256(float *a, float *result);
+inline void basic_sqrt_float32_512(float *a, float *result);
+inline void basic_rsqrt_float32_64(float *a, float *result);
+inline void basic_rsqrt_float32_96(float *a, float *result);
+inline void basic_rsqrt_float32_128(float *a, float *result);
+inline void basic_rsqrt_float32_256(float *a, float *result);
+inline void basic_rsqrt_float32_512(float *a, float *result);



@ -123,6 +133,52 @@ inline void simd_div_float32_128_aligned(float *a, float *b, float* result) {
    __m128 vr = _mm_div_ps(va, vb);
    _mm_store_ps(result, vr);
 }
+// Computes the square root of the three floats at 'a' and writes exactly three
+// floats to 'result'. Neither pointer needs any particular alignment.
+inline void simd_sqrt_float32_96(float *a, float *result) {
+    // Build the vector from the three real elements. A 128-bit load would read a
+    // fourth float past the end of a 96-bit operand. The unused top lane is 0.
+    __m128 va = _mm_set_ps(0.0f, a[2], a[1], a[0]);
+    __m128 vr = _mm_sqrt_ps(va);
+    // Store lanes 0..2 only. A 128-bit store would write past the end of a
+    // 96-bit result and clobber whatever follows it.
+    _mm_storel_pi((__m64*)result, vr);                // lanes 0 and 1
+    _mm_store_ss(result + 2, _mm_movehl_ps(vr, vr));  // lane 2
+}
+
+// Computes an approximate reciprocal square root (1/sqrt(x)) of the three floats
+// at 'a' and writes exactly three floats to 'result'. Neither pointer needs any
+// particular alignment.
+inline void simd_rsqrt_float32_96(float *a, float *result) {
+    // Build the vector from the three real elements. A 128-bit load would read a
+    // fourth float past the end of a 96-bit operand. The unused top lane is set
+    // to 1 so it does not evaluate rsqrt(0).
+    __m128 va = _mm_set_ps(1.0f, a[2], a[1], a[0]);
+    __m128 vr = _mm_rsqrt_ps(va);
+    // Store lanes 0..2 only. A 128-bit store would write past the end of a
+    // 96-bit result and clobber whatever follows it.
+    _mm_storel_pi((__m64*)result, vr);                // lanes 0 and 1
+    _mm_store_ss(result + 2, _mm_movehl_ps(vr, vr));  // lane 2
+}
+// Square root of the two floats at 'a', written to the two floats at 'result'.
+inline void simd_sqrt_float32_64(float *a, float *result) {
+    // Place the pair in the low half of a zeroed register; the upper lanes stay 0.
+    __m128 zeroed = _mm_setzero_ps();
+    __m128 pair   = _mm_loadl_pi(zeroed, (__m64*)a);
+    // Only the low 64 bits (two floats) are written back.
+    _mm_storel_pi((__m64*)result, _mm_sqrt_ps(pair));
+}
+
+// Approximate reciprocal square root of the two floats at 'a', written to the
+// two floats at 'result'.
+inline void simd_rsqrt_float32_64(float *a, float *result) {
+    // Place the pair in the low half of a zeroed register; the upper lanes stay 0.
+    __m128 zeroed = _mm_setzero_ps();
+    __m128 pair   = _mm_loadl_pi(zeroed, (__m64*)a);
+    // Only the low 64 bits (two floats) are written back.
+    _mm_storel_pi((__m64*)result, _mm_rsqrt_ps(pair));
+}
+// Square root of the four floats at 'a', written to the four floats at 'result'.
+// Uses unaligned load/store, so neither pointer has to be 16-byte aligned.
+inline void simd_sqrt_float32_128(float *a, float *result) {
+    _mm_storeu_ps(result, _mm_sqrt_ps(_mm_loadu_ps(a)));
+}
+
+// Approximate reciprocal square root of the four floats at 'a', written to the
+// four floats at 'result'. Uses unaligned load/store, so neither pointer has to
+// be 16-byte aligned.
+inline void simd_rsqrt_float32_128(float *a, float *result) {
+    _mm_storeu_ps(result, _mm_rsqrt_ps(_mm_loadu_ps(a)));
+}
+// Square root of the four floats at 'a', written to the four floats at 'result'.
+// Uses aligned load/store: both pointers must be 16-byte aligned.
+inline void simd_sqrt_float32_128_aligned(float *a, float *result) {
+    __m128 input = _mm_load_ps(a);
+    _mm_store_ps(result, _mm_sqrt_ps(input));
+}
+
+// Approximate reciprocal square root of the four floats at 'a', written to the
+// four floats at 'result'. Uses aligned load/store: both pointers must be
+// 16-byte aligned.
+inline void simd_rsqrt_float32_128_aligned(float *a, float *result) {
+    __m128 input = _mm_load_ps(a);
+    _mm_store_ps(result, _mm_rsqrt_ps(input));
+}


 #if SIMD_ENABLE_SSE2
@ -191,14 +247,6 @@ inline float simd_dot_product_float32_96(float *a, float *b) {
    __m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
    return _mm_cvtss_f32(dot_product);
 }
-inline float simd_dot_product_float32_96_aligned(float *a, float *b) {
-    __m128 vec1 = _mm_load_ps(a);
-    __m128 vec2 = _mm_load_ps(b);
-    vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
-    vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
-    __m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
-    return _mm_cvtss_f32(dot_product);
-}
 inline float simd_dot_product_float32_128(float *a, float *b) {
    __m128 vec1 = _mm_loadu_ps(a);
    __m128 vec2 = _mm_loadu_ps(b);
@ -217,8 +265,6 @@ inline float simd_dot_product_float32_128_aligned(float *a, float *b) {
 	#define simd_dot_product_float32_64 basic_dot_product_float32_64
 	#define simd_dot_product_float32_96 basic_dot_product_float32_96
 	#define simd_dot_product_float32_128 basic_dot_product_float32_128
-	#define simd_dot_product_float32_64_aligned basic_dot_product_float32_64
-	#define simd_dot_product_float32_96_aligned basic_dot_product_float32_96
 	#define simd_dot_product_float32_128_aligned basic_dot_product_float32_128
 #endif // SIMD_ENABLE_SSE41

@ -275,16 +321,41 @@ inline void simd_div_float32_256_aligned(float32 *a, float32 *b, float32* result
    __m256 vr = _mm256_div_ps(va, vb);
    _mm256_store_ps(result, vr);
 }
+// Square root of the eight floats at 'a', written to the eight floats at
+// 'result'. Uses unaligned load/store, so no 32-byte alignment is required.
+inline void simd_sqrt_float32_256(float *a, float *result) {
+    _mm256_storeu_ps(result, _mm256_sqrt_ps(_mm256_loadu_ps(a)));
+}
+
+// Approximate reciprocal square root of the eight floats at 'a', written to the
+// eight floats at 'result'. Uses unaligned load/store, so no 32-byte alignment
+// is required.
+inline void simd_rsqrt_float32_256(float *a, float *result) {
+    _mm256_storeu_ps(result, _mm256_rsqrt_ps(_mm256_loadu_ps(a)));
+}
+// Square root of the eight floats at 'a', written to the eight floats at
+// 'result'. Uses aligned load/store: both pointers must be 32-byte aligned.
+inline void simd_sqrt_float32_256_aligned(float *a, float *result) {
+    __m256 input = _mm256_load_ps(a);
+    _mm256_store_ps(result, _mm256_sqrt_ps(input));
+}
+
+// Approximate reciprocal square root of the eight floats at 'a', written to the
+// eight floats at 'result'. Uses aligned load/store: both pointers must be
+// 32-byte aligned.
+inline void simd_rsqrt_float32_256_aligned(float *a, float *result) {
+    __m256 input = _mm256_load_ps(a);
+    _mm256_store_ps(result, _mm256_rsqrt_ps(input));
+}
 #else
 	#define simd_add_float32_256 	basic_add_float32_256
 	#define simd_sub_float32_256 	basic_sub_float32_256
 	#define simd_mul_float32_256 	basic_mul_float32_256
 	#define simd_div_float32_256 	basic_div_float32_256
-	
+	#define simd_sqrt_float32_256   		basic_sqrt_float32_256
+	#define simd_rsqrt_float32_256  		basic_rsqrt_float32_256
 	#define simd_add_float32_256_aligned 	basic_add_float32_256
 	#define simd_sub_float32_256_aligned 	basic_sub_float32_256
 	#define simd_mul_float32_256_aligned 	basic_mul_float32_256
 	#define simd_div_float32_256_aligned 	basic_div_float32_256
+	#define simd_sqrt_float32_256_aligned   basic_sqrt_float32_256
+	#define simd_rsqrt_float32_256_aligned  basic_rsqrt_float32_256
 #endif

 #if SIMD_ENABLE_AVX2
@ -332,7 +403,6 @@ inline void simd_mul_int32_256_aligned(s32 *a, s32 *b, s32* result) {
 	#define simd_add_int32_256 		basic_add_int32_256
 	#define simd_sub_int32_256 		basic_sub_int32_256
 	#define simd_mul_int32_256 		basic_mul_int32_256
-	
 	#define simd_add_int32_256_aligned 		basic_add_int32_256
 	#define simd_sub_int32_256_aligned 		basic_sub_int32_256
 	#define simd_mul_int32_256_aligned 		basic_mul_int32_256
@ -432,6 +502,28 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
    __m512i vr = _mm512_mullo_epi32(va, vb);
    _mm512_store_si512((__m512i*)result, vr);
 }
+// Square root of the sixteen floats at 'a', written to the sixteen floats at
+// 'result'. Uses unaligned load/store, so no 64-byte alignment is required.
+inline void simd_sqrt_float32_512(float *a, float *result) {
+    _mm512_storeu_ps(result, _mm512_sqrt_ps(_mm512_loadu_ps(a)));
+}
+
+// Approximate reciprocal square root of the sixteen floats at 'a', written to
+// the sixteen floats at 'result'. Uses unaligned load/store.
+inline void simd_rsqrt_float32_512(float *a, float *result) {
+    __m512 input = _mm512_loadu_ps(a);
+    // AVX-512 does not have _mm512_rsqrt_ps; rsqrt14 is the 512-bit approximation.
+    _mm512_storeu_ps(result, _mm512_rsqrt14_ps(input));
+}
+// Square root of the sixteen floats at 'a', written to the sixteen floats at
+// 'result'. Uses aligned load/store: both pointers must be 64-byte aligned.
+inline void simd_sqrt_float32_512_aligned(float *a, float *result) {
+    __m512 input = _mm512_load_ps(a);
+    _mm512_store_ps(result, _mm512_sqrt_ps(input));
+}
+
+// Approximate reciprocal square root of the sixteen floats at 'a', written to
+// the sixteen floats at 'result'. Uses aligned load/store: both pointers must
+// be 64-byte aligned.
+inline void simd_rsqrt_float32_512_aligned(float *a, float *result) {
+    __m512 input = _mm512_load_ps(a);
+    // Same rsqrt14 approximation as the unaligned 512-bit variant.
+    _mm512_store_ps(result, _mm512_rsqrt14_ps(input));
+}
 #else 
 	#define simd_add_float32_512 	basic_add_float32_512
 	#define simd_sub_float32_512 	basic_sub_float32_512
@ -440,7 +532,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
 	#define simd_add_int32_512 		basic_add_int32_512
 	#define simd_sub_int32_512 		basic_sub_int32_512
 	#define simd_mul_int32_512 		basic_mul_int32_512
-	
+	#define simd_sqrt_float32_512   basic_sqrt_float32_512
+	#define simd_rsqrt_float32_512  basic_rsqrt_float32_512
 	#define simd_add_float32_512_aligned 	basic_add_float32_512
 	#define simd_sub_float32_512_aligned 	basic_sub_float32_512
 	#define simd_mul_float32_512_aligned 	basic_mul_float32_512
@ -448,6 +541,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
 	#define simd_add_int32_512_aligned 		basic_add_int32_512
 	#define simd_sub_int32_512_aligned 		basic_sub_int32_512
 	#define simd_mul_int32_512_aligned 		basic_mul_int32_512
+	#define simd_sqrt_float32_512_aligned   basic_sqrt_float32_512
+	#define simd_rsqrt_float32_512_aligned  basic_rsqrt_float32_512
 #endif // SIMD_ENABLE_AVX512

 #else
@ -461,10 +556,16 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
 #define simd_mul_float32_128 	basic_mul_float32_128
 #define simd_div_float32_64 	basic_div_float32_64
 #define simd_div_float32_128 	basic_div_float32_128
+// Scalar fallbacks for the SSE sqrt/rsqrt entry points. The 96-bit variants are
+// aliased too, so simd_sqrt_float32_96 / simd_rsqrt_float32_96 resolve when SIMD
+// is disabled, the same way simd_dot_product_float32_96 does.
+#define simd_sqrt_float32_64   	basic_sqrt_float32_64
+#define simd_sqrt_float32_96    basic_sqrt_float32_96
+#define simd_sqrt_float32_128   basic_sqrt_float32_128
+#define simd_rsqrt_float32_64   basic_rsqrt_float32_64
+#define simd_rsqrt_float32_96   basic_rsqrt_float32_96
+#define simd_rsqrt_float32_128  basic_rsqrt_float32_128
 #define simd_add_float32_128_aligned 	basic_add_float32_128
 #define simd_sub_float32_128_aligned 	basic_sub_float32_128
 #define simd_mul_float32_128_aligned 	basic_mul_float32_128
 #define simd_div_float32_128_aligned 	basic_div_float32_128
+#define simd_sqrt_float32_128_aligned   basic_sqrt_float32_128
+#define simd_rsqrt_float32_128_aligned  basic_rsqrt_float32_128

 // SSE2
 #define simd_add_int32_128 		basic_add_int32_128
@ -475,19 +576,26 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
 #define simd_mul_int32_128_aligned 		basic_mul_int32_128

 // SSE41
+#define simd_mul_int32_128 		basic_mul_int32_128
+#define simd_mul_int32_128_aligned 		basic_mul_int32_128
 #define simd_dot_product_float32_64 basic_dot_product_float32_64
 #define simd_dot_product_float32_96 basic_dot_product_float32_96
 #define simd_dot_product_float32_128 basic_dot_product_float32_128
+#define simd_dot_product_float32_128_aligned basic_dot_product_float32_128

 // AVX
 #define simd_add_float32_256 	basic_add_float32_256
 #define simd_sub_float32_256 	basic_sub_float32_256
 #define simd_mul_float32_256 	basic_mul_float32_256
 #define simd_div_float32_256 	basic_div_float32_256
+#define simd_sqrt_float32_256   		basic_sqrt_float32_256
+#define simd_rsqrt_float32_256  		basic_rsqrt_float32_256
 #define simd_add_float32_256_aligned 	basic_add_float32_256
 #define simd_sub_float32_256_aligned 	basic_sub_float32_256
 #define simd_mul_float32_256_aligned 	basic_mul_float32_256
 #define simd_div_float32_256_aligned 	basic_div_float32_256
+#define simd_sqrt_float32_256_aligned   basic_sqrt_float32_256
+#define simd_rsqrt_float32_256_aligned  basic_rsqrt_float32_256

 // AVX2
 #define simd_add_int32_256 		basic_add_int32_256
@ -505,6 +613,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
 #define simd_add_int32_512 		basic_add_int32_512
 #define simd_sub_int32_512 		basic_sub_int32_512
 #define simd_mul_int32_512 		basic_mul_int32_512
+#define simd_sqrt_float32_512   basic_sqrt_float32_512
+#define simd_rsqrt_float32_512  basic_rsqrt_float32_512
 #define simd_add_float32_512_aligned 	basic_add_float32_512
 #define simd_sub_float32_512_aligned 	basic_sub_float32_512
 #define simd_mul_float32_512_aligned 	basic_mul_float32_512
@ -512,9 +622,14 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
 #define simd_add_int32_512_aligned 		basic_add_int32_512
 #define simd_sub_int32_512_aligned 		basic_sub_int32_512
 #define simd_mul_int32_512_aligned 		basic_mul_int32_512
+#define simd_sqrt_float32_512_aligned   basic_sqrt_float32_512
+#define simd_rsqrt_float32_512_aligned  basic_rsqrt_float32_512

 #endif

+double __cdecl sqrt(_In_ double _X);
+double __cdecl rsqrt(_In_ double _X);
+
 inline void basic_add_float32_64 (float32 *a, float32 *b, float32* result) {
 	result[0] = a[0] + b[0];
 	result[1] = a[1] + b[1];
@ -638,6 +753,55 @@ inline float basic_dot_product_float32_96(float *a, float *b) {
 inline float basic_dot_product_float32_128(float *a, float *b) {
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 }
+// Scalar fallback: square root of the two floats at 'a', written to 'result'.
+inline void basic_sqrt_float32_64(float *a, float *result) {
+    for (int lane = 0; lane < 2; lane += 1) {
+        // sqrt() works in double; the result narrows back to float on assignment.
+        result[lane] = sqrt(a[lane]);
+    }
+}
+// Scalar fallback: square root of the three floats at 'a', written to 'result'.
+inline void basic_sqrt_float32_96(float *a, float *result) {
+    for (int lane = 0; lane < 3; lane += 1) {
+        // sqrt() works in double; the result narrows back to float on assignment.
+        result[lane] = sqrt(a[lane]);
+    }
+}
+// Scalar fallback: square root of the four floats at 'a', written to 'result'.
+inline void basic_sqrt_float32_128(float *a, float *result) {
+    for (int lane = 0; lane < 4; lane += 1) {
+        // sqrt() works in double; the result narrows back to float on assignment.
+        result[lane] = sqrt(a[lane]);
+    }
+}
+// Scalar fallback: square root of eight floats, done as two 128-bit halves.
+inline void basic_sqrt_float32_256(float *a, float *result) {
+    float *upper_in  = a + 4;
+    float *upper_out = result + 4;
+    basic_sqrt_float32_128(a, result);            // elements 0..3
+    basic_sqrt_float32_128(upper_in, upper_out);  // elements 4..7
+}
+// Scalar fallback: square root of sixteen floats, done as two 256-bit halves.
+inline void basic_sqrt_float32_512(float *a, float *result) {
+    float *upper_in  = a + 8;
+    float *upper_out = result + 8;
+    basic_sqrt_float32_256(a, result);            // elements 0..7
+    basic_sqrt_float32_256(upper_in, upper_out);  // elements 8..15
+}
+// Scalar fallback: reciprocal square root (1/sqrt(x)) of the two floats at 'a',
+// written to 'result'.
+inline void basic_rsqrt_float32_64(float *a, float *result) {
+    // The C runtime has no rsqrt(), so compute it from sqrt() instead of
+    // referencing a symbol that cannot be linked.
+    result[0] = (float)(1.0 / sqrt(a[0]));
+    result[1] = (float)(1.0 / sqrt(a[1]));
+}
+// Scalar fallback: reciprocal square root (1/sqrt(x)) of the three floats at
+// 'a', written to 'result'.
+inline void basic_rsqrt_float32_96(float *a, float *result) {
+    // The C runtime has no rsqrt(), so compute it from sqrt() instead of
+    // referencing a symbol that cannot be linked.
+    result[0] = (float)(1.0 / sqrt(a[0]));
+    result[1] = (float)(1.0 / sqrt(a[1]));
+    result[2] = (float)(1.0 / sqrt(a[2]));
+}
+// Scalar fallback: reciprocal square root (1/sqrt(x)) of the four floats at
+// 'a', written to 'result'.
+inline void basic_rsqrt_float32_128(float *a, float *result) {
+    // The C runtime has no rsqrt(), so compute it from sqrt() instead of
+    // referencing a symbol that cannot be linked.
+    result[0] = (float)(1.0 / sqrt(a[0]));
+    result[1] = (float)(1.0 / sqrt(a[1]));
+    result[2] = (float)(1.0 / sqrt(a[2]));
+    result[3] = (float)(1.0 / sqrt(a[3]));
+}
+// Scalar fallback: reciprocal square root of eight floats, done as two 128-bit
+// halves.
+inline void basic_rsqrt_float32_256(float *a, float *result) {
+    float *upper_in  = a + 4;
+    float *upper_out = result + 4;
+    basic_rsqrt_float32_128(a, result);            // elements 0..3
+    basic_rsqrt_float32_128(upper_in, upper_out);  // elements 4..7
+}
+// Scalar fallback: reciprocal square root of sixteen floats, done as two
+// 256-bit halves.
+inline void basic_rsqrt_float32_512(float *a, float *result) {
+    float *upper_in  = a + 8;
+    float *upper_out = result + 8;
+    basic_rsqrt_float32_256(a, result);            // elements 0..7
+    basic_rsqrt_float32_256(upper_in, upper_out);  // elements 8..15
+}
+
+
+



--- a/oogabooga/string.c
+++ b/oogabooga/string.c
@ -5,7 +5,6 @@
 	
 */

-void * memcpy (void *,const void *,size_t);
 void* talloc(u64);

 typedef struct string {
@ -13,6 +12,7 @@ typedef struct string {
 	u8 *data;
 } string;

+#define fixed_string STR
 #define STR(s) ((string){ length_of_null_terminated_string((const char*)s), (u8*)s })

 inline u64 length_of_null_terminated_string(const char* cstring) {
--- a/oogabooga/string_format.c
+++ b/oogabooga/string_format.c
@ -213,7 +213,7 @@ void printf(const char* fmt, ...) {


 typedef void(*Logger_Proc)(Log_Level level, string s);
-#define LOG_BASE(level, ...) If context.logger then ((Logger_Proc)context.logger)(level, tprint(__VA_ARGS__))
+#define LOG_BASE(level, ...) if (context.logger) ((Logger_Proc)context.logger)(level, tprint(__VA_ARGS__))


 #define log_verbose(...) LOG_BASE(LOG_VERBOSE, __VA_ARGS__)
--- a/oogabooga/third_party.c
+++ b/oogabooga/third_party.c
@ -1,6 +1,6 @@
 // Custom allocators for lodepng
 Allocator get_heap_allocator();
-Allocator lodepng_allocator = {0};
+/*Allocator lodepng_allocator = {0};
 void* lodepng_malloc(size_t size) {
 #ifdef LODEPNG_MAX_ALLOC
 	  if(size > LODEPNG_MAX_ALLOC) return 0;
@ -28,4 +28,47 @@ void lodepng_free(void* ptr) {
 #define LODEPNG_NO_COMPILE_ENCODER
 // One day I might write my own png decoder so we don't even need this
 #include "third_party/lodepng.h" 
-#include "third_party/lodepng.c"
+#include "third_party/lodepng.c"*/
+
+#define STB_TRUETYPE_IMPLEMENTATION
+#define STB_IMAGE_IMPLEMENTATION
+
+typedef unsigned char   u8;
+typedef signed   char   s8;
+typedef unsigned short  u16;
+typedef signed   short  s16;
+typedef unsigned int    u32;
+typedef signed   int    s32;
+
+// Allocation hook for the stb libraries: allocates 'size' bytes from the heap
+// allocator. A zero-byte request returns 0 without touching the allocator.
+void *stbtt_malloc(size_t size) {
+	if (size == 0) {
+		return 0;
+	}
+	Allocator heap = get_heap_allocator();
+	return alloc(heap, size);
+}
+#define STBTT_malloc(x,u) ((void)(u),stbtt_malloc(x))
+// Free hook for the stb libraries: returns 'p' to the heap allocator.
+// A null pointer is ignored.
+void stbtt_free(void *p) {
+	if (p) {
+		Allocator heap = get_heap_allocator();
+		dealloc(heap, p);
+	}
+}
+#define STBTT_free(x,u)    ((void)(u),stbtt_free(x))
+
+#define STBTT_assert(x)    assert(x)
+
+// Returns the number of chars in 'str' before its null terminator.
+size_t stbtt_strlen(const char* str) {
+	const char *cursor = str;
+	while (*cursor != 0) {
+		cursor += 1;
+	}
+	return (size_t)(cursor - str);
+}
+#define STBTT_strlen(x) stbtt_strlen(x)
+#define STBTT_memcpy memcpy
+#define STBTT_memset memset
+
+
+#define STBI_NO_STDIO
+#define STBI_ASSERT(x) {if (!(x)) *(volatile char*)0 = 0;}
+
+#define STBI_MALLOC(sz)           stbtt_malloc(sz)
+#define STBI_REALLOC(p,newsz)     get_heap_allocator().proc(newsz, p, ALLOCATOR_REALLOCATE, 0)
+#define STBI_FREE(p)              stbtt_free(p)
+
+#include "third_party/stb_image.h"
+#include "third_party/stb_truetype.h"
--- a/oogabooga/third_party/stb_image.h
+++ b/oogabooga/third_party/stb_image.h
--- a/oogabooga/third_party/stb_truetype.h
+++ b/oogabooga/third_party/stb_truetype.h