From f9788b2e7472981046de4390544e7c89ee59dd17 Mon Sep 17 00:00:00 2001
From: Charlie Malmqvist <charlie.malmqvist1@gmail.com>
Date: Sun, 28 Jul 2024 15:08:36 +0200
Subject: [PATCH] Some memory rework, concurrency improvements, cleanups

---
 TODO                        |  11 +-
 build.c                     |   6 +-
 changelog.txt               |  11 +-
 oogabooga/concurrency.c     |  30 +---
 oogabooga/cpu.c             |  22 +--
 oogabooga/gfx_impl_d3d11.c  | 105 ++++++-----
 oogabooga/memory.c          |  64 ++-----
 oogabooga/oogabooga.c       |   2 +-
 oogabooga/os_impl_windows.c | 339 ++++++++++++++++++++++--------------
 oogabooga/os_interface.c    |  44 ++++-
 oogabooga/profiling.c       |  12 +-
 oogabooga/tests.c           |   2 +-
 12 files changed, 365 insertions(+), 283 deletions(-)

diff --git a/TODO b/TODO
index 5995d2f..4c6624f 100644
--- a/TODO
+++ b/TODO
@@ -21,7 +21,6 @@
 		- 24-Bit audio conversion doesn't really work
 			
 - General bugs & issues
-	- Release freeze in run_tests
 	- Window width&height is zero when minimized (and we make a 0x0 swap chain)
 	- Window positioning & sizing is fucky wucky
 	- Memory error messages are misleading when no VERY_DEBUG
@@ -43,13 +42,19 @@
 	- Mouse pointer
 	   - Hide mouse pointer
 	
-	
-- Arenas
+- Memory
+	- In heap allocator, mark pages that fit entirely into free nodes as NOACCESS
+	- Arenas
 
 - Examples/Guides:
     - Scaling text for pixel perfect rendering
     - Z sorting
     - Scissor boxing
+    - Concurrency
+	
+- Rework profiler
+	- Store records and convert to google trace format on exit
+	- Measure both time and cycles, output a google_trace_cycles.json & google_trace_time.json
 	
 - Needs testing:
 	- Audio format channel conversions
diff --git a/build.c b/build.c
index fbb44aa..aa12dd3 100644
--- a/build.c
+++ b/build.c
@@ -3,7 +3,7 @@
 ///
 // Build config stuff
 
-#define INITIAL_PROGRAM_MEMORY_SIZE MB(5)
+#define INITIAL_PROGRAM_MEMORY_SIZE MB(8)
 
 // You might want to increase this if you get a log warning saying the temporary storage was overflown.
 // In many cases, overflowing the temporary storage should be fine since it just wraps back around and
@@ -37,11 +37,11 @@ typedef struct Context_Extra {
 
 // #include "oogabooga/examples/text_rendering.c"
 // #include "oogabooga/examples/custom_logger.c"
-// #include "oogabooga/examples/renderer_stress_test.c"
+#include "oogabooga/examples/renderer_stress_test.c"
 // #include "oogabooga/examples/tile_game.c"
 // #include "oogabooga/examples/audio_test.c"
 // #include "oogabooga/examples/custom_shader.c"
-#include "oogabooga/examples/growing_array_example.c"
+// #include "oogabooga/examples/growing_array_example.c"
 
 // This is where you swap in your own project!
 // #include "entry_yourepicgamename.c"
diff --git a/changelog.txt b/changelog.txt
index f0a8ee5..4bffd38 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,5 +1,5 @@
 
-## v0.01.003 - Nothing, really
+## v0.01.003 - Stuff
 	- Os layer
 		- Implemented setting of mouse pointers, either to system standard pointers or a custom image
 		- Ignore SETCURSOR events unless window resize
@@ -9,10 +9,19 @@
 	- Renderer
 		- Fix bad uv sampling bug when uneven window dimensions
 	
+	- Memory
+		- Made program_memory act more like an arena (see os_reserve_next_memory_pages() & os_unlock_program_memory_pages())
+		- In debug, default program memory to PAGE_NOACCESS which needs to be unlocked with os_unlock_program_memory_pages() (better crashes if we touch that memory)
+		- Heap blocks are now page-aligned and taken from program memory with os_reserve_next_memory_pages()
+	
 	- Misc
 		- Deprecate Rangef stuff
 		- peek_random()
 		- Update #Contributions
+		- Clean up memory barriers in concurrency.c and use volatile instead
+		- Output d3d11 debug messages before crash on hr fail
+		- Configurable temporary storage size for new threads
+		- Cleanup temporary storage after thread destroy
 
 
 ## v0.01.002 - Flexible build options, Hotloading, growing array
diff --git a/oogabooga/concurrency.c b/oogabooga/concurrency.c
index 4f2c3ea..f0a7f94 100644
--- a/oogabooga/concurrency.c
+++ b/oogabooga/concurrency.c
@@ -4,18 +4,18 @@ typedef struct Mutex Mutex;
 typedef struct Binary_Semaphore Binary_Semaphore;
 
 // These are probably your best friend for sync-free multi-processing.
-inline bool compare_and_swap_8(uint8_t *a, uint8_t b, uint8_t old);
-inline bool compare_and_swap_16(uint16_t *a, uint16_t b, uint16_t old);
-inline bool compare_and_swap_32(uint32_t *a, uint32_t b, uint32_t old);
-inline bool compare_and_swap_64(uint64_t *a, uint64_t b, uint64_t old);
-inline bool compare_and_swap_bool(bool *a, bool b, bool old);
+inline bool compare_and_swap_8(volatile uint8_t *a, uint8_t b, uint8_t old);
+inline bool compare_and_swap_16(volatile uint16_t *a, uint16_t b, uint16_t old);
+inline bool compare_and_swap_32(volatile uint32_t *a, uint32_t b, uint32_t old);
+inline bool compare_and_swap_64(volatile uint64_t *a, uint64_t b, uint64_t old);
+inline bool compare_and_swap_bool(volatile bool *a, bool b, bool old);
 
 ///
 // Spinlock "primitive"
 // Like a mutex but it eats up the entire core while waiting.
 // Beneficial if contention is low or sync speed is important
 typedef struct Spinlock {
-	bool locked;
+	volatile bool locked;
 } Spinlock;
 
 void ogb_instance
@@ -61,7 +61,7 @@ mutex_release(Mutex *m);
 ///
 // Binary semaphore
 typedef struct Binary_Semaphore {
-    bool signaled;
+    volatile bool signaled;
     Mutex mutex;
 } Binary_Semaphore;
 
@@ -86,16 +86,12 @@ void spinlock_init(Spinlock *l) {
 void spinlock_acquire_or_wait(Spinlock* l) {
 	while (true) {
         bool expected = false;
-        MEMORY_BARRIER;
         if (compare_and_swap_bool(&l->locked, true, expected)) {
-        	MEMORY_BARRIER;
             return;
         }
         while (l->locked) {
             // spinny boi
-            MEMORY_BARRIER;
         }
-        MEMORY_BARRIER;
     }
 }
 // Returns true on aquired, false if timeout seconds reached
@@ -103,24 +99,19 @@ bool spinlock_acquire_or_wait_timeout(Spinlock* l, f64 timeout_seconds) {
     f64 start = os_get_current_time_in_seconds();
 	while (true) {
         bool expected = false;
-        MEMORY_BARRIER;
         if (compare_and_swap_bool(&l->locked, true, expected)) {
-        	MEMORY_BARRIER;
             return true;
         }
         while (l->locked) {
             // spinny boi
             if ((os_get_current_time_in_seconds()-start) >= timeout_seconds) return false;
-            MEMORY_BARRIER;
         }
     }
     return true;
 }
 void spinlock_release(Spinlock* l) {
 	bool expected = true;
-    MEMORY_BARRIER;
     bool success = compare_and_swap_bool(&l->locked, false, expected);
-    MEMORY_BARRIER;
     assert(success, "This thread should have acquired the spinlock but compare_and_swap failed");
 }
 
@@ -142,29 +133,22 @@ void mutex_acquire_or_wait(Mutex *m) {
 	if (spinlock_acquire_or_wait_timeout(&m->spinlock, m->spin_time_microseconds / 1000000.0)) {
         assert(!m->spinlock_acquired, "Internal sync error in Mutex");
     	m->spinlock_acquired = true;
-    	MEMORY_BARRIER;
     }
     os_lock_mutex(m->os_handle);
     
     assert(!m->acquiring_thread, "Internal sync error in Mutex: Multiple threads acquired");
     m->acquiring_thread = context.thread_id;
-    MEMORY_BARRIER;
 }
 void mutex_release(Mutex *m) {
 	assert(m->acquiring_thread != 0, "Tried to release a mutex which is not acquired");
 	assert(m->acquiring_thread == context.thread_id, "Non-owning thread tried to release mutex");
 	m->acquiring_thread = 0;
-	MEMORY_BARRIER;
 	bool was_spinlock_acquired = m->spinlock_acquired;
 	m->spinlock_acquired = false;
-	MEMORY_BARRIER;
 	os_unlock_mutex(m->os_handle);
-	MEMORY_BARRIER;
 	if (was_spinlock_acquired) {
 		spinlock_release(&m->spinlock);
-		MEMORY_BARRIER;
 	}
-	MEMORY_BARRIER;
 }
 
 
diff --git a/oogabooga/cpu.c b/oogabooga/cpu.c
index 1c0a0cf..d901910 100644
--- a/oogabooga/cpu.c
+++ b/oogabooga/cpu.c
@@ -81,27 +81,27 @@ typedef struct Cpu_Capabilities {
 	#pragma intrinsic(_InterlockedCompareExchange64)
 	
 	inline bool 
-	compare_and_swap_8(uint8_t *a, uint8_t b, uint8_t old) {
+	compare_and_swap_8(volatile uint8_t *a, uint8_t b, uint8_t old) {
 	    return _InterlockedCompareExchange8((volatile char*)a, (char)b, (char)old) == old;
 	}
 	
 	inline bool 
-	compare_and_swap_16(uint16_t *a, uint16_t b, uint16_t old) {
+	compare_and_swap_16(volatile uint16_t *a, uint16_t b, uint16_t old) {
 	    return _InterlockedCompareExchange16((volatile short*)a, (short)b, (short)old) == old;
 	}
 	
 	inline bool 
-	compare_and_swap_32(uint32_t *a, uint32_t b, uint32_t old) {
+	compare_and_swap_32(volatile uint32_t *a, uint32_t b, uint32_t old) {
 	    return _InterlockedCompareExchange((volatile long*)a, (long)b, (long)old) == old;
 	}
 	
 	inline bool 
-	compare_and_swap_64(uint64_t *a, uint64_t b, uint64_t old) {
+	compare_and_swap_64(volatile uint64_t *a, uint64_t b, uint64_t old) {
 	    return _InterlockedCompareExchange64((volatile long long*)a, (long long)b, (long long)old) == old;
 	}
 	
 	inline bool 
-	compare_and_swap_bool(bool *a, bool b, bool old) {
+	compare_and_swap_bool(volatile bool *a, bool b, bool old) {
 	    return compare_and_swap_8((uint8_t*)a, (uint8_t)b, (uint8_t)old);
 	}
 	
@@ -173,7 +173,7 @@ typedef struct Cpu_Capabilities {
 	#define DEPRECATED(proc, msg) __attribute__((deprecated(msg))) proc 
 	
 	inline bool 
-	compare_and_swap_8(uint8_t *a, uint8_t b, uint8_t old) {
+	compare_and_swap_8(volatile uint8_t *a, uint8_t b, uint8_t old) {
 	    unsigned char result;
 	    __asm__ __volatile__(
 	        "lock; cmpxchgb %2, %1"
@@ -185,7 +185,7 @@ typedef struct Cpu_Capabilities {
 	}
 	
 	inline bool 
-	compare_and_swap_16(uint16_t *a, uint16_t b, uint16_t old) {
+	compare_and_swap_16(volatile uint16_t *a, uint16_t b, uint16_t old) {
 	    unsigned short result;
 	    __asm__ __volatile__(
 	        "lock; cmpxchgw %2, %1"
@@ -197,7 +197,7 @@ typedef struct Cpu_Capabilities {
 	}
 	
 	inline bool 
-	compare_and_swap_32(uint32_t *a, uint32_t b, uint32_t old) {
+	compare_and_swap_32(volatile uint32_t *a, uint32_t b, uint32_t old) {
 	    unsigned int result;
 	    __asm__ __volatile__(
 	        "lock; cmpxchgl %2, %1"
@@ -209,7 +209,7 @@ typedef struct Cpu_Capabilities {
 	}
 	
 	inline bool 
-	compare_and_swap_64(uint64_t *a, uint64_t b, uint64_t old) {
+	compare_and_swap_64(volatile uint64_t *a, uint64_t b, uint64_t old) {
 	    unsigned long long result;
 	    __asm__ __volatile__(
 	        "lock; cmpxchgq %2, %1"
@@ -221,11 +221,11 @@ typedef struct Cpu_Capabilities {
 	}
 	
 	inline bool 
-	compare_and_swap_bool(bool *a, bool b, bool old) {
+	compare_and_swap_bool(volatile bool *a, bool b, bool old) {
 	    return compare_and_swap_8((uint8_t*)a, (uint8_t)b, (uint8_t)old);
 	}
 	
-	#define MEMORY_BARRIER __asm__ __volatile__("" ::: "memory")
+	#define MEMORY_BARRIER do { __asm__ __volatile__("" ::: "memory"); __sync_synchronize(); } while (0) // do/while(0): "MEMORY_BARRIER;" stays a single statement, safe in unbraced if/else
 	
 	#define thread_local __thread
 	
diff --git a/oogabooga/gfx_impl_d3d11.c b/oogabooga/gfx_impl_d3d11.c
index 4514a9f..ab27362 100644
--- a/oogabooga/gfx_impl_d3d11.c
+++ b/oogabooga/gfx_impl_d3d11.c
@@ -26,6 +26,8 @@ typedef struct alignat(16) D3D11_Vertex {
 	
 } D3D11_Vertex;
 
+// #Global
+
 ID3D11Debug *d3d11_debug = 0;
 
 ID3D11Device *d3d11_device = 0;
@@ -61,9 +63,6 @@ u64 d3d11_cbuffer_size = 0;
 Draw_Quad *sort_quad_buffer = 0;
 u64 sort_quad_buffer_size = 0;
 
-// Defined at the bottom of this file
-extern const char *d3d11_image_shader_source;
-
 const char* d3d11_stringify_category(D3D11_MESSAGE_CATEGORY category) {
     switch (category) {
     case D3D11_MESSAGE_CATEGORY_APPLICATION_DEFINED: return "Application Defined";
@@ -90,7 +89,6 @@ const char* d3d11_stringify_severity(D3D11_MESSAGE_SEVERITY severity) {
     default: return "Unknown";
     }
 }
-
 void CALLBACK d3d11_debug_callback(D3D11_MESSAGE_CATEGORY category, D3D11_MESSAGE_SEVERITY severity, D3D11_MESSAGE_ID id, const char* description)
 {
 	if (id == 391) {
@@ -122,6 +120,40 @@ void CALLBACK d3d11_debug_callback(D3D11_MESSAGE_CATEGORY category, D3D11_MESSAG
 			break;
 	}
 }
+void
+d3d11_output_debug_messages() {
+	// Forwards every stored message that passes the retrieval filter to d3d11_debug_callback.
+	// Also reached through d3d11_check_hr() in gfx_init, where the device may still be null.
+	if (!d3d11_device) return;
+	ID3D11InfoQueue* info_q = 0;
+	HRESULT hr = ID3D11Device_QueryInterface(d3d11_device, &IID_ID3D11InfoQueue, (void**)&info_q);
+	if (SUCCEEDED(hr)) {
+		u64 msg_count = ID3D11InfoQueue_GetNumStoredMessagesAllowedByRetrievalFilter(info_q);
+		for (u64 i = 0; i < msg_count; i++) {
+		    SIZE_T msg_size = 0;
+		    ID3D11InfoQueue_GetMessage(info_q, i, 0, &msg_size); // Null buffer: only fetches the size
+		    D3D11_MESSAGE* msg = (D3D11_MESSAGE*)talloc(msg_size);
+		    if (msg) {
+		        ID3D11InfoQueue_GetMessage(info_q, i, msg, &msg_size); // Get the actual message
+		        d3d11_debug_callback(msg->Category, msg->Severity, msg->ID, msg->pDescription);
+		    }
+		}
+		ID3D11InfoQueue_Release(info_q); // QueryInterface added a reference; drop it so it does not leak on every call
+	}
+}
+
+// Like win32_check_hr, but prints pending D3D11 debug messages before the failure is reported.
+#define d3d11_check_hr(hr) d3d11_check_hr_impl(hr, __LINE__, __FILE__)
+void d3d11_check_hr_impl(HRESULT hr, u32 line, const char* file_name) {
+	if (hr != S_OK) d3d11_output_debug_messages(); // Same condition win32_check_hr_impl fails on
+	win32_check_hr_impl(hr, line, file_name);
+}
+
+// Defined at the bottom of this file
+// #Global
+extern const char *d3d11_image_shader_source;
+
+
 
 void d3d11_update_swapchain() {
 
@@ -163,18 +195,18 @@ void d3d11_update_swapchain() {
 		// Obtain DXGI factory from device
 		IDXGIDevice *dxgi_device = 0;
 		hr = ID3D11Device_QueryInterface(d3d11_device, &IID_IDXGIDevice, cast(void**)&dxgi_device);
-		win32_check_hr(hr);
+		d3d11_check_hr(hr);
 		
 		IDXGIAdapter *adapter;
 		hr = IDXGIDevice_GetAdapter(dxgi_device, &adapter);
-		win32_check_hr(hr);
+		d3d11_check_hr(hr);
 		
 		IDXGIFactory2 *dxgi_factory;
 		hr = IDXGIAdapter_GetParent(adapter, &IID_IDXGIFactory2, cast(void**)&dxgi_factory); 
-		win32_check_hr(hr);
+		d3d11_check_hr(hr);
 	
 		hr = IDXGIFactory2_CreateSwapChainForHwnd(dxgi_factory, (IUnknown*)d3d11_device, window._os_handle, &scd, 0, 0, &d3d11_swap_chain); 	
-		win32_check_hr(hr);
+		d3d11_check_hr(hr);
 		
 		RECT client_rect;
 		bool ok = GetClientRect(window._os_handle, &client_rect);
@@ -185,7 +217,7 @@ void d3d11_update_swapchain() {
 		
 		// store the swap chain description, as created by CreateSwapChainForHwnd
 		hr = IDXGISwapChain1_GetDesc1(d3d11_swap_chain, &d3d11_swap_chain_desc);
-		win32_check_hr(hr);
+		d3d11_check_hr(hr);
 		
 		// disable alt enter
 		IDXGIFactory_MakeWindowAssociation(dxgi_factory, window._os_handle, cast (u32) DXGI_MWA_NO_ALT_ENTER); 
@@ -207,11 +239,11 @@ void d3d11_update_swapchain() {
 		u32 window_height = client_rect.bottom-client_rect.top;
 		
 		hr = IDXGISwapChain1_ResizeBuffers(d3d11_swap_chain, d3d11_swap_chain_desc.BufferCount, window_width, window_height, d3d11_swap_chain_desc.Format, d3d11_swap_chain_desc.Flags);
-		win32_check_hr(hr);
+		d3d11_check_hr(hr);
 		
 		// update swap chain description
 		hr = IDXGISwapChain1_GetDesc1(d3d11_swap_chain, &d3d11_swap_chain_desc);
-		win32_check_hr(hr);
+		d3d11_check_hr(hr);
 		
 		log("Resized swap chain from %dx%d to %dx%d", d3d11_swap_chain_width, d3d11_swap_chain_height, window_width, window_height);
 		
@@ -223,9 +255,9 @@ void d3d11_update_swapchain() {
 	
 	
 	hr = IDXGISwapChain1_GetBuffer(d3d11_swap_chain, 0, &IID_ID3D11Texture2D, (void**)&d3d11_back_buffer);
-	win32_check_hr(hr);
+	d3d11_check_hr(hr);
 	hr = ID3D11Device_CreateRenderTargetView(d3d11_device, (ID3D11Resource*)d3d11_back_buffer, 0, &d3d11_window_render_target_view); 
-	win32_check_hr(hr);
+	d3d11_check_hr(hr);
 }
 
 bool
@@ -266,10 +298,10 @@ d3d11_compile_shader(string source) {
     
     // Create the shaders
     hr = ID3D11Device_CreateVertexShader(d3d11_device, vs_buffer, vs_size, NULL, &d3d11_vertex_shader_for_2d);
-    win32_check_hr(hr);
+    d3d11_check_hr(hr);
 
     hr = ID3D11Device_CreatePixelShader(d3d11_device, ps_buffer, ps_size, NULL, &d3d11_fragment_shader_for_2d);
-    win32_check_hr(hr);
+    d3d11_check_hr(hr);
 
     log_verbose("Shaders created");
 
@@ -362,7 +394,7 @@ d3d11_compile_shader(string source) {
 	
 	
 	hr = ID3D11Device_CreateInputLayout(d3d11_device, layout, layout_base_count+VERTEX_2D_USER_DATA_COUNT, vs_buffer, vs_size, &d3d11_image_vertex_layout);
-	win32_check_hr(hr);
+	d3d11_check_hr(hr);
 	
 	#undef layout_base_count
 
@@ -427,7 +459,7 @@ void gfx_init() {
 		}
 	}
 	
-	win32_check_hr(hr);
+	d3d11_check_hr(hr);
 	
 	if (debug_failed) {
 		log_error("We could not init D3D11 with DEBUG flag. To fix this, you can try:\n1. Go to windows settings\n2. Go to System -> Optional features\n3. Add the feature called \"Graphics Tools\"\n4. Restart your computer\n5. Be frustrated that windows is like this.\nhttps://devblogs.microsoft.com/cppblog/visual-studio-2015-and-graphics-tools-for-windows-10/");
@@ -476,7 +508,7 @@ void gfx_init() {
 	    bd.RenderTarget[0].BlendOpAlpha = D3D11_BLEND_OP_ADD;
 	    bd.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL;
 	    hr = ID3D11Device_CreateBlendState(d3d11_device, &bd, &d3d11_blend_state);
-	    win32_check_hr(hr);
+	    d3d11_check_hr(hr);
 	    ID3D11DeviceContext_OMSetBlendState(d3d11_context, d3d11_blend_state, NULL, 0xffffffff);
 	}
 	
@@ -488,7 +520,7 @@ void gfx_init() {
 	    desc.DepthClipEnable = FALSE;
 	    desc.CullMode = D3D11_CULL_NONE;
 	    hr = ID3D11Device_CreateRasterizerState(d3d11_device, &desc, &d3d11_rasterizer);
-	    win32_check_hr(hr);
+	    d3d11_check_hr(hr);
 	    ID3D11DeviceContext_RSSetState(d3d11_context, d3d11_rasterizer);
 	}
 	
@@ -502,19 +534,19 @@ void gfx_init() {
 	    
 	    sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT;
 	    hr = ID3D11Device_CreateSamplerState(d3d11_device, &sd, &d3d11_image_sampler_np_fp);
-	    win32_check_hr(hr);
+	    d3d11_check_hr(hr);
 	    
 	    sd.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR;
 	    hr =ID3D11Device_CreateSamplerState(d3d11_device, &sd, &d3d11_image_sampler_nl_fl);
-	    win32_check_hr(hr);
+	    d3d11_check_hr(hr);
 	    
 	    sd.Filter = D3D11_FILTER_MIN_LINEAR_MAG_MIP_POINT;
 	    hr = ID3D11Device_CreateSamplerState(d3d11_device, &sd, &d3d11_image_sampler_np_fl);
-	    win32_check_hr(hr);
+	    d3d11_check_hr(hr);
 	    
 	    sd.Filter = D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR;
 	    hr = ID3D11Device_CreateSamplerState(d3d11_device, &sd, &d3d11_image_sampler_nl_fp);
-	    win32_check_hr(hr);
+	    d3d11_check_hr(hr);
 	}
 	
 	string source = STR(d3d11_image_shader_source);
@@ -798,7 +830,7 @@ void d3d11_process_draw_frame() {
 		    D3D11_MAPPED_SUBRESOURCE buffer_mapping;
 			tm_scope("The Map call") {
 				hr = ID3D11DeviceContext_Map(d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping);
-			win32_check_hr(hr);
+			d3d11_check_hr(hr);
 			}
 			tm_scope("The memcpy") {
 				memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
@@ -841,24 +873,7 @@ void gfx_update() {
 	
 	
 #if CONFIGURATION == DEBUG
-	///
-	// Check debug messages, output to stdout
-	ID3D11InfoQueue* info_q = 0;
-	hr = ID3D11Device_QueryInterface(d3d11_device, &IID_ID3D11InfoQueue, (void**)&info_q);
-	if (SUCCEEDED(hr)) {
-		u64 msg_count = ID3D11InfoQueue_GetNumStoredMessagesAllowedByRetrievalFilter(info_q);
-		for (u64 i = 0; i < msg_count; i++) {
-		    SIZE_T msg_size = 0;
-		    ID3D11InfoQueue_GetMessage(info_q, i, 0, &msg_size);
-		
-		    D3D11_MESSAGE* msg = (D3D11_MESSAGE*)talloc(msg_size);
-		    if (msg) {
-		        ID3D11InfoQueue_GetMessage(info_q, i, msg, &msg_size); // Get the actual message
-		        
-		        d3d11_debug_callback(msg->Category, msg->Severity, msg->ID, msg->pDescription);
-		    }
-		}
-	}
+	d3d11_output_debug_messages();
 #endif
 	
 }
@@ -899,10 +914,10 @@ void gfx_init_image(Gfx_Image *image, void *initial_data) {
 	
 	ID3D11Texture2D* texture = 0;
 	HRESULT hr = ID3D11Device_CreateTexture2D(d3d11_device, &desc, &data_desc, &texture);
-	win32_check_hr(hr);
+	d3d11_check_hr(hr);
 	
 	hr = ID3D11Device_CreateShaderResourceView(d3d11_device, (ID3D11Resource*)texture, 0, &image->gfx_handle);
-	win32_check_hr(hr);
+	d3d11_check_hr(hr);
 	
 	if (!initial_data) {
 		dealloc(image->allocator, data);
@@ -972,7 +987,7 @@ shader_recompile_with_extension(string ext_source, u64 cbuffer_size) {
 	desc.BindFlags      = D3D11_BIND_CONSTANT_BUFFER;
 	desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
 	HRESULT hr = ID3D11Device_CreateBuffer(d3d11_device, &desc, null, &d3d11_cbuffer);
-	win32_check_hr(hr);
+	d3d11_check_hr(hr);
 	
 	d3d11_cbuffer_size = cbuffer_size;
 	
diff --git a/oogabooga/memory.c b/oogabooga/memory.c
index afe3a92..05beea0 100644
--- a/oogabooga/memory.c
+++ b/oogabooga/memory.c
@@ -4,18 +4,6 @@
 #define MB(x) ((KB(x))*1024ull)
 #define GB(x) ((MB(x))*1024ull)
 
-
-// #Global
-ogb_instance void *program_memory;
-ogb_instance u64 program_memory_size;
-ogb_instance Mutex_Handle program_memory_mutex;
-
-#if !OOGABOOGA_LINK_EXTERNAL_INSTANCE
-void *program_memory = 0;
-u64 program_memory_size = 0;
-Mutex_Handle program_memory_mutex = 0;
-#endif // NOT OOGABOOGA_LINK_EXTERNAL_INSTANCE
-
 #ifndef INIT_MEMORY_SIZE
 	#define INIT_MEMORY_SIZE KB(50)
 #endif
@@ -63,7 +51,7 @@ Allocator get_initialization_allocator() {
 // BUT: We aren't really supposed to allocate/deallocate directly on the heap too much anyways...
 
 #define MAX_HEAP_BLOCK_SIZE ((MB(500)+os.page_size)& ~(os.page_size-1))
-#define DEFAULT_HEAP_BLOCK_SIZE (min(MAX_HEAP_BLOCK_SIZE, program_memory_size))
+#define DEFAULT_HEAP_BLOCK_SIZE (min(MAX_HEAP_BLOCK_SIZE, program_memory_capacity))
 #define HEAP_ALIGNMENT (sizeof(Heap_Free_Node))
 typedef struct Heap_Free_Node Heap_Free_Node;
 typedef struct Heap_Block Heap_Block;
@@ -115,7 +103,7 @@ u64 get_heap_block_size_including_metadata(Heap_Block *block) {
 }
 
 bool is_pointer_in_program_memory(void *p) {
-	return (u8*)p >= (u8*)program_memory && (u8*)p<((u8*)program_memory+program_memory_size);
+	return (u8*)p >= (u8*)program_memory && (u8*)p<((u8*)program_memory+program_memory_capacity);
 }
 bool is_pointer_in_stack(void* p) {
     void* stack_base = os_get_stack_base();
@@ -229,33 +217,20 @@ Heap_Block *make_heap_block(Heap_Block *parent, u64 size) {
 
 	size += sizeof(Heap_Block);
 
-	size = (size) & ~(HEAP_ALIGNMENT-1);	
+	size = (size+os.page_size) & ~(os.page_size-1);	
 
-	Heap_Block *block;
-	if (parent) {
-		block = (Heap_Block*)(((u8*)parent)+get_heap_block_size_including_metadata(parent));
-		parent->next = block;
-	} else {
-		block = (Heap_Block*)program_memory;
-	}
+	Heap_Block *block = (Heap_Block*)os_reserve_next_memory_pages(size);
+		
+	assert((u64)block % os.page_size == 0, "Heap block not aligned to page size");
+	
+	if (parent) parent->next = block;
+	
+	os_unlock_program_memory_pages(block, size);
+	
 #if CONFIGURATION == DEBUG
 	block->total_allocated = 0;
 #endif
 	
-	
-	if (((u8*)block)+size >= ((u8*)program_memory)+program_memory_size) {
-		u64 minimum_size = ((u8*)block+size) - (u8*)program_memory + 1;
-		u64 new_program_size = get_next_power_of_two(minimum_size);
-		assert(new_program_size >= minimum_size, "Internal goof");
-		const u64 ATTEMPTS = 1000;
-		for (u64 i = 0; i <= ATTEMPTS; i++) {
-			if (program_memory_size >= new_program_size) break; // Another thread might have resized already, causing it to fail here.
-			assert(i < ATTEMPTS, "OS is not letting us allocate more memory. Maybe we are out of memory? You sure must be using a lot of memory then.");
-			if (os_grow_program_memory(new_program_size))
-				break;
-		}
-	}
-	
 	block->start = ((u8*)block)+sizeof(Heap_Block);
 	block->size = size;
 	block->next = 0;
@@ -547,14 +522,12 @@ get_temporary_allocator();
 
 #if !OOGABOOGA_LINK_EXTERNAL_INSTANCE
 thread_local void * temporary_storage = 0;
-thread_local bool   temporary_storage_initted = false;
 thread_local void * temporary_storage_pointer = 0;
 thread_local bool   has_warned_temporary_storage_overflow = false;
 thread_local Allocator temp_allocator;
 
 ogb_instance Allocator 
 get_temporary_allocator() {
-    if (!temporary_storage_initted) return get_initialization_allocator();
 	return temp_allocator;
 }
 #endif
@@ -563,7 +536,7 @@ ogb_instance void*
 temp_allocator_proc(u64 size, void *p, Allocator_Message message, void* data);
 
 ogb_instance void 
-temporary_storage_init();
+temporary_storage_init(u64 arena_size);
 
 ogb_instance void* 
 talloc(u64 size);
@@ -589,23 +562,19 @@ void* temp_allocator_proc(u64 size, void *p, Allocator_Message message, void* da
 	return 0;
 }
 
-void temporary_storage_init() {
-	if (temporary_storage_initted) return;
+void temporary_storage_init(u64 arena_size) {
 	
-	temporary_storage = heap_alloc(TEMPORARY_STORAGE_SIZE);
+	temporary_storage = heap_alloc(arena_size);
 	assert(temporary_storage, "Failed allocating temporary storage");
 	temporary_storage_pointer = temporary_storage;
 
 	temp_allocator.proc = temp_allocator_proc;
 	temp_allocator.data = 0;
 	
-	temporary_storage_initted = true;
-	
 	temp_allocator.proc = temp_allocator_proc;
 }
 
 void* talloc(u64 size) {
-	if (!temporary_storage_initted) temporary_storage_init();
 	
 	assert(size < TEMPORARY_STORAGE_SIZE, "Bruddah this is too large for temp allocator");
 	
@@ -625,10 +594,7 @@ void* talloc(u64 size) {
 }
 
 void reset_temporary_storage() {
-	if (!temporary_storage_initted) temporary_storage_init();
-	
-	temporary_storage_pointer = temporary_storage;
-	
+	temporary_storage_pointer = temporary_storage;	
 	has_warned_temporary_storage_overflow = true;
 }
 
diff --git a/oogabooga/oogabooga.c b/oogabooga/oogabooga.c
index 975a8c2..034ca35 100644
--- a/oogabooga/oogabooga.c
+++ b/oogabooga/oogabooga.c
@@ -385,7 +385,7 @@ void oogabooga_init(u64 program_memory_size) {
 	Cpu_Capabilities features = query_cpu_capabilities();
 	os_init(program_memory_size);
 	heap_init();
-	temporary_storage_init();
+	temporary_storage_init(TEMPORARY_STORAGE_SIZE);
 	log_info("Ooga booga version is %d.%02d.%03d", OGB_VERSION_MAJOR, OGB_VERSION_MINOR, OGB_VERSION_PATCH);
 #ifndef OOGABOOGA_HEADLESS
 	gfx_init();
diff --git a/oogabooga/os_impl_windows.c b/oogabooga/os_impl_windows.c
index efc9eb3..8950f66 100644
--- a/oogabooga/os_impl_windows.c
+++ b/oogabooga/os_impl_windows.c
@@ -12,6 +12,59 @@
 void* heap_alloc(u64);
 void heap_dealloc(void*);
 
+u16 *win32_fixed_utf8_to_null_terminated_wide(string utf8, Allocator allocator) {
+	// Converts a counted UTF-8 string to a null-terminated UTF-16 string allocated with 'allocator'. Returns 0 if the conversion fails.
+	if (utf8.count == 0) {
+		u16 *utf16_str = (u16 *)alloc(allocator, (1) * sizeof(u16)); // Empty input: just a terminator
+		*utf16_str = 0;
+		return utf16_str;
+	}
+    // First call passes no output buffer, so it only measures the length in UTF-16 code units.
+    u64 utf16_length = MultiByteToWideChar(CP_UTF8, 0, (LPCCH)utf8.data, (int)utf8.count, 0, 0);
+    // One extra element for the null terminator written below.
+    u16 *utf16_str = (u16 *)alloc(allocator, (utf16_length + 1) * sizeof(u16));
+
+    int result = MultiByteToWideChar(CP_UTF8, 0, (LPCCH)utf8.data, (int)utf8.count, utf16_str, utf16_length);
+    if (result == 0) {
+        dealloc(allocator, utf16_str);
+        return 0;
+    }
+
+    utf16_str[utf16_length] = 0;
+
+    return utf16_str;
+}
+u16 *temp_win32_fixed_utf8_to_null_terminated_wide(string utf8) { // Same conversion, allocated with the temporary allocator
+	return win32_fixed_utf8_to_null_terminated_wide(utf8, get_temporary_allocator());
+}
+string win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16, Allocator allocator) { // Null-terminated UTF-16 -> counted UTF-8; empty string on failure
+    u64 utf8_length = WideCharToMultiByte(CP_UTF8, 0, (LPCWCH)utf16, -1, 0, 0, 0, 0); // Measure only; length -1 means null-terminated input, so the size includes the terminator
+
+	if (utf8_length == 0) {
+		string utf8;
+		utf8.count = 0;
+		utf8.data = 0;
+		return utf8;
+	}
+	// The buffer holds the converted terminator too, hence utf8_length bytes.
+    u8 *utf8_str = (u8 *)alloc(allocator, utf8_length * sizeof(u8));
+
+    int result = WideCharToMultiByte(CP_UTF8, 0, (LPCWCH)utf16, -1, (LPSTR)utf8_str, (int)utf8_length, 0, 0);
+    if (result == 0) {
+        dealloc(allocator, utf8_str);
+        return (string){0, 0};
+    }
+
+    string utf8;
+    utf8.data = utf8_str;
+    utf8.count = utf8_length-1; // Exclude the terminator from the count
+
+    return utf8;
+}
+
+string temp_win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16) { // Same conversion, allocated with the temporary allocator
+    return win32_null_terminated_wide_to_fixed_utf8(utf16, get_temporary_allocator());
+}
 #define win32_check_hr(hr) win32_check_hr_impl(hr, __LINE__, __FILE__);
 void win32_check_hr_impl(HRESULT hr, u32 line, const char* file_name) {
     if (hr != S_OK) {
@@ -30,12 +83,21 @@ void win32_check_hr_impl(HRESULT hr, u32 line, const char* file_name) {
             0,
             NULL );
 
+		u16 *wide_err = 0;
+
         if (messageLength > 0) {
-            MessageBoxW(NULL, (LPWSTR)errorMsg, L"Error", MB_OK | MB_ICONERROR);
+        	wide_err = (LPWSTR)errorMsg;
         } else {
-            MessageBoxW(NULL, L"Failed to retrieve error message.", L"Error", MB_OK | MB_ICONERROR);
+        	wide_err = (u16*)L"Failed to retrieve error message.";
         }
     
+    	string utf8_err = temp_win32_null_terminated_wide_to_fixed_utf8(wide_err);
+    	
+    	string final_message_utf8 = tprint("%s\nIn file %cs on line %d", utf8_err, file_name, line);
+    	
+    	u16 *final_message_wide = temp_win32_fixed_utf8_to_null_terminated_wide(final_message_utf8);
+    
+        MessageBoxW(NULL, final_message_wide, L"Error", MB_OK | MB_ICONERROR);
 
         panic("win32 hr failed in file %cs on line %d, hr was %d", file_name, line, hr);
     }
@@ -266,11 +328,19 @@ win32_audio_thread(Thread *t);
 void 
 win32_audio_poll_default_device_thread(Thread *t);
 
-bool win32_has_audio_thread_started = false;
+volatile bool win32_has_audio_thread_started = false;
 #endif /* OOGABOOGA_HEADLESS */
 
-void os_init(u64 program_memory_size) {
+void os_init(u64 program_memory_capacity) {
 	
+    // #Volatile
+    // Any printing uses vsnprintf, and printing may happen in init,
+    // especially on errors, so this needs to happen first.
+    os.crt = os_load_dynamic_library(STR("msvcrt.dll"));
+	assert(os.crt != 0, "Could not load win32 crt library. Might be compiled with non-msvc? #Incomplete #Portability");
+	os.crt_vsnprintf = (Crt_Vsnprintf_Proc)os_dynamic_library_load_symbol(os.crt, STR("vsnprintf"));
+	assert(os.crt_vsnprintf, "Missing vsnprintf in crt");
+
 #if CONFIGURATION == DEBUG
 	HANDLE process = GetCurrentProcess();
 	SymInitialize(process, NULL, TRUE);
@@ -284,7 +354,10 @@ void os_init(u64 program_memory_size) {
 
 
 #if CONFIGURATION == RELEASE
+	// #Configurable #Copypaste
 	SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
+	SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
+	timeBeginPeriod(1);
 #endif
 
 	SetProcessDpiAwarenessContext(DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2);
@@ -314,14 +387,11 @@ void os_init(u64 program_memory_size) {
 
 
 	program_memory_mutex = os_make_mutex();
-	os_grow_program_memory(program_memory_size);
+	os_grow_program_memory(program_memory_capacity);
 	
 	heap_init();
 	
-	os.crt = os_load_dynamic_library(STR("msvcrt.dll"));
-	assert(os.crt != 0, "Could not load win32 crt library. Might be compiled with non-msvc? #Incomplete #Portability");
-	os.crt_vsnprintf = (Crt_Vsnprintf_Proc)os_dynamic_library_load_symbol(os.crt, STR("vsnprintf"));
-	assert(os.crt_vsnprintf, "Missing vsnprintf in crt");
+	
 	
 #ifndef OOGABOOGA_HEADLESS
     win32_init_window();
@@ -380,71 +450,7 @@ void s64_to_null_terminated_string(s64 num, char* str, int base)
     s64_to_null_terminated_string_reverse(str, i);
 }
 
-bool os_grow_program_memory(u64 new_size) {
-	os_lock_mutex(program_memory_mutex); // #Sync
-	if (program_memory_size >= new_size) {
-		os_unlock_mutex(program_memory_mutex); // #Sync
-		return true;
-	}
 
-	
-	
-	bool is_first_time = program_memory == 0;
-	
-	if (is_first_time) {
-		u64 aligned_size = (new_size+os.granularity) & ~(os.granularity);
-		void* aligned_base = (void*)(((u64)VIRTUAL_MEMORY_BASE+os.granularity) & ~(os.granularity-1));
-
-		program_memory = VirtualAlloc(aligned_base, aligned_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
-		if (program_memory == 0) { 
-			os_unlock_mutex(program_memory_mutex); // #Sync
-			return false;
-		}
-		program_memory_size = aligned_size;
-		
-		memset(program_memory, 0xBA, program_memory_size);
-	} else {
-		// #Cleanup this mess
-		// Allocation size doesn't actually need to be aligned to granularity, page size is enough.
-		// Doesn't matter that much tho, but this is just a bit unfortunate to look at.
-		void* tail = (u8*)program_memory + program_memory_size;
-		u64 m = ((u64)program_memory_size % os.granularity);
-		assert(m == 0, "program_memory_size is not aligned to granularity!");
-		m = ((u64)tail % os.granularity);
-		assert(m == 0, "Tail is not aligned to granularity!");
-		u64 amount_to_allocate = new_size-program_memory_size;
-		amount_to_allocate = ((amount_to_allocate+os.granularity)&~(os.granularity-1));
-		m = ((u64)amount_to_allocate % os.granularity);
-		assert(m == 0, "amount_to_allocate is not aligned to granularity!");
-		// Just keep allocating at the tail of the current chunk
-		void* result = VirtualAlloc(tail, amount_to_allocate, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
-		assert(result == tail);
-#if CONFIGURATION == DEBUG
-		volatile u8 a = *(u8*)tail = 69;
-#endif
-		memset(result, 0xBA, amount_to_allocate);
-		if (result == 0) { 
-			os_unlock_mutex(program_memory_mutex); // #Sync
-			return false;
-		}
-		assert(tail == result, "It seems tail is not aligned properly. o nein");
-		
-		program_memory_size += amount_to_allocate;
-
-		m = ((u64)program_memory_size % os.granularity);
-		assert(m == 0, "program_memory_size is not aligned to granularity!");
-	}
-
-	
-	char size_str[32];
-	s64_to_null_terminated_string(program_memory_size/1024, size_str, 10);
-	
-	os_write_string_to_stdout(STR("Program memory grew to "));
-	os_write_string_to_stdout(STR(size_str));
-	os_write_string_to_stdout(STR(" kb\n"));
-	os_unlock_mutex(program_memory_mutex); // #Sync
-	return true;
-}
 
 
 ///
@@ -458,15 +464,25 @@ bool os_grow_program_memory(u64 new_size) {
 
 DWORD WINAPI win32_thread_invoker(LPVOID param) {
 
-#if CONFIGURATION == RELEASE
-	SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
-#endif
 
 	Thread *t = (Thread*)param;
-	temporary_storage_init();
+	
+#if CONFIGURATION == RELEASE
+	// #Configurable #Copypaste
+	SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
+	SetThreadPriority(t->os_handle, THREAD_PRIORITY_TIME_CRITICAL);
+	timeBeginPeriod(1);
+#endif
+	
+	temporary_storage_init(t->temporary_storage_size);
+	
 	context = t->initial_context;
 	context.thread_id = GetCurrentThreadId();
+	
 	t->proc(t);
+	
+	heap_dealloc(temporary_storage);
+	
 	return 0;
 }
 
@@ -508,6 +524,7 @@ void os_thread_init(Thread *t, Thread_Proc proc) {
 	t->id = 0;
 	t->proc = proc;
 	t->initial_context = context;
+	t->temporary_storage_size = KB(10);
 }
 void os_thread_destroy(Thread *t) {
 	os_thread_join(t);
@@ -649,59 +666,7 @@ void os_write_string_to_stdout(string s) {
 	WriteFile(win32_stdout, s.data, s.count, 0, 0);
 }
 
-u16 *win32_fixed_utf8_to_null_terminated_wide(string utf8, Allocator allocator) {
 
-	if (utf8.count == 0) {
-		u16 *utf16_str = (u16 *)alloc(allocator, (1) * sizeof(u16));
-		*utf16_str = 0;
-		return utf16_str;
-	}
-
-    u64 utf16_length = MultiByteToWideChar(CP_UTF8, 0, (LPCCH)utf8.data, (int)utf8.count, 0, 0);
-
-    u16 *utf16_str = (u16 *)alloc(allocator, (utf16_length + 1) * sizeof(u16));
-
-    int result = MultiByteToWideChar(CP_UTF8, 0, (LPCCH)utf8.data, (int)utf8.count, utf16_str, utf16_length);
-    if (result == 0) {
-        dealloc(allocator, utf16_str);
-        return 0;
-    }
-
-    utf16_str[utf16_length] = 0;
-
-    return utf16_str;
-}
-u16 *temp_win32_fixed_utf8_to_null_terminated_wide(string utf8) {
-	return win32_fixed_utf8_to_null_terminated_wide(utf8, get_temporary_allocator());
-}
-string win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16, Allocator allocator) {
-    u64 utf8_length = WideCharToMultiByte(CP_UTF8, 0, (LPCWCH)utf16, -1, 0, 0, 0, 0);
-
-	if (utf8_length == 0) {
-		string utf8;
-		utf8.count = 0;
-		utf8.data = 0;
-		return utf8;
-	}
-
-    u8 *utf8_str = (u8 *)alloc(allocator, utf8_length * sizeof(u8));
-
-    int result = WideCharToMultiByte(CP_UTF8, 0, (LPCWCH)utf16, -1, (LPSTR)utf8_str, (int)utf8_length, 0, 0);
-    if (result == 0) {
-        dealloc(allocator, utf8_str);
-        return (string){0, 0};
-    }
-
-    string utf8;
-    utf8.data = utf8_str;
-    utf8.count = utf8_length-1;
-
-    return utf8;
-}
-
-string temp_win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16) {
-    return win32_null_terminated_wide_to_fixed_utf8(utf16, get_temporary_allocator());
-}
 
 
 File os_file_open_s(string path, Os_Io_Open_Flags flags) {
@@ -1186,6 +1151,116 @@ os_get_stack_trace(u64 *trace_count, Allocator allocator) {
 #endif // NOT DEBUG
 }
 
+// Grows program memory so that its capacity is at least new_size bytes.
+// Returns true if the capacity suffices afterwards, false if VirtualAlloc failed.
+bool os_grow_program_memory(u64 new_size) {
+	os_lock_mutex(program_memory_mutex); // #Sync
+	if (program_memory_capacity >= new_size) {
+		os_unlock_mutex(program_memory_mutex); // #Sync
+		return true;
+	}
+	
+	bool is_first_time = program_memory == 0;
+	
+	if (is_first_time) {
+		// It's fine to allocate a region with size only aligned to page size, BUT,
+		// since we allocate each region with the base address at the tail of the
+		// previous region, then that tail needs to be aligned to granularity, which
+		// will be true if the size is also always aligned to granularity.
+		// Round up with a (granularity-1) mask; masking with ~granularity only clears a single bit.
+		u64 aligned_size = (new_size+os.granularity-1) & ~(os.granularity-1);
+		void* aligned_base = (void*)(((u64)VIRTUAL_MEMORY_BASE+os.granularity) & ~(os.granularity-1));
+		program_memory = VirtualAlloc(aligned_base, aligned_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+		if (program_memory == 0) { 
+			os_unlock_mutex(program_memory_mutex); // #Sync
+			return false;
+		}
+		program_memory_next = program_memory;
+		program_memory_capacity = aligned_size;
+#if CONFIGURATION == DEBUG
+		memset(program_memory, 0xBA, program_memory_capacity);
+        DWORD _ = PAGE_READWRITE;
+		VirtualProtect(program_memory, aligned_size, PAGE_NOACCESS, &_);
+#endif
+	} else {
+		void* tail = (u8*)program_memory + program_memory_capacity;
+		
+		assert((u64)program_memory_capacity % os.granularity == 0, "program_memory_capacity is not aligned to granularity!");
+		assert((u64)tail % os.granularity == 0, "Tail is not aligned to granularity!");
+		
+		u64 amount_to_allocate = new_size-program_memory_capacity;
+		amount_to_allocate = ((amount_to_allocate+os.granularity)&~(os.granularity-1));
+		
+		// Just keep allocating at the tail of the current chunk
+		void* result = VirtualAlloc(tail, amount_to_allocate, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+		// Check for failure before touching the new region; on failure result is null.
+		if (result == 0) { 
+			os_unlock_mutex(program_memory_mutex); // #Sync
+			return false;
+		}
+		assert(tail == result, "It seems tail is not aligned properly. o nein");
+#if CONFIGURATION == DEBUG
+		memset(result, 0xBA, amount_to_allocate);
+		DWORD _ = PAGE_READWRITE;
+		VirtualProtect(result, amount_to_allocate, PAGE_NOACCESS, &_);
+#endif
+		program_memory_capacity += amount_to_allocate;
+		assert((u64)program_memory_capacity % os.granularity == 0, "program_memory_capacity is not aligned to granularity!");
+	}
+
+	
+	char size_str[32];
+	s64_to_null_terminated_string(program_memory_capacity/1024, size_str, 10);
+	
+	os_write_string_to_stdout(STR("Program memory grew to "));
+	os_write_string_to_stdout(STR(size_str));
+	os_write_string_to_stdout(STR(" kb\n"));
+	os_unlock_mutex(program_memory_mutex); // #Sync
+	return true;
+}
+
+void*
+os_reserve_next_memory_pages(u64 size) {
+	// Returns the current program_memory_next and advances it by `size` bytes,
+	// growing program memory when the advanced pointer passes the end of the current capacity.
+	assert(size % os.page_size == 0, "size was not aligned to page size in os_reserve_next_memory_pages");
+	// NOTE(review): program_memory_next is read and advanced here without taking program_memory_mutex - confirm callers serialize.
+	void *p = program_memory_next;
+	program_memory_next = (u8*)program_memory_next + size;
+	
+	void *program_tail = (u8*)program_memory + program_memory_capacity;
+	if ((u64)program_memory_next > (u64)program_tail) {
+		u64 minimum_size = ((u64)program_memory_next) - (u64)program_memory + 1;
+		u64 new_program_size = get_next_power_of_two(minimum_size);
+		
+		const u64 ATTEMPTS = 1000;
+		for (u64 i = 0; i <= ATTEMPTS; i++) {
+			if (program_memory_capacity >= new_program_size) break; // Another thread might have resized already, causing it to fail here.
+			assert(i < ATTEMPTS, "OS is not letting us allocate more memory. Maybe we are out of memory? You sure must be using a lot of memory then.");
+			if (os_grow_program_memory(new_program_size))
+				break;
+		}
+	}
+	
+	return p;
+}
+
+void
+os_unlock_program_memory_pages(void *start, u64 size) {
+#if CONFIGURATION == DEBUG
+	assert((u64)start % os.page_size == 0, "When unlocking memory pages, the start address must be the start of a page");
+	assert(size       % os.page_size == 0, "When unlocking memory pages, the size must be aligned to page_size");
+	// Sets PAGE_READWRITE on every page in [start, start+size); outside DEBUG builds this function does nothing.
+	// This memory may be across multiple allocated regions so we need to do this one page at a time.
+	// Probably super slow but this shouldn't happen often at all + it's only in debug. - Charlie M 28th July 2024
+	for (u8 *p = (u8*)start; p < (u8*)start+size; p += os.page_size) {
+		DWORD old_protect = PAGE_NOACCESS;
+		BOOL ok = VirtualProtect(p, os.page_size, PAGE_READWRITE, &old_protect);
+		assert(ok, "VirtualProtect Failed with error %d", GetLastError());
+	}
+#endif
+}
+
 ///
 ///
 // Mouse pointer
@@ -1460,7 +1535,6 @@ win32_audio_init() {
 void
 win32_audio_poll_default_device_thread(Thread *t) {
 	while (!win32_has_audio_thread_started) {
-		MEMORY_BARRIER;
 		os_yield_thread();
 	}
 
@@ -1470,9 +1544,7 @@ win32_audio_poll_default_device_thread(Thread *t) {
 		}
 		
 		mutex_acquire_or_wait(&audio_init_mutex);
-		MEMORY_BARRIER;
 	    mutex_release(&audio_init_mutex);
-		MEMORY_BARRIER;
 	
 		IMMDevice *now_default = 0;
 		HRESULT hr = IMMDeviceEnumerator_GetDefaultAudioEndpoint(win32_device_enumerator, eRender, eConsole, &now_default);
@@ -1507,11 +1579,8 @@ win32_audio_thread(Thread *t) {
 	
     mutex_acquire_or_wait(&audio_init_mutex);
     win32_has_audio_thread_started = true;
-    MEMORY_BARRIER;
 	win32_audio_init();
     mutex_release(&audio_init_mutex);
-        
-    timeBeginPeriod(1);
 	
 	u32 buffer_frame_count;
     HRESULT hr = IAudioClient_GetBufferSize(win32_audio_client, &buffer_frame_count);
diff --git a/oogabooga/os_interface.c b/oogabooga/os_interface.c
index 41948c0..8ee3ce8 100644
--- a/oogabooga/os_interface.c
+++ b/oogabooga/os_interface.c
@@ -87,8 +87,7 @@ inline int vsnprintf(char* buffer, size_t n, const char* fmt, va_list args) {
 
 
 
-bool ogb_instance
-os_grow_program_memory(size_t new_size);
+
 
 ///
 ///
@@ -103,8 +102,11 @@ typedef struct Thread {
 	u64 id; // This is valid after os_thread_start
 	Context initial_context;
 	void* data;
+	u64 temporary_storage_size; // Defaults to KB(10)
 	Thread_Proc proc;
 	Thread_Handle os_handle;
+	
+	
 	Allocator allocator;  // Deprecated !! #Cleanup
 } Thread;
 
@@ -395,7 +397,8 @@ os_get_number_of_logical_processors();
 ogb_instance string*
 os_get_stack_trace(u64 *trace_count, Allocator allocator);
 
-void dump_stack_trace() {
+inline void 
+dump_stack_trace() {
 	u64 count;
 	string *strings = os_get_stack_trace(&count, get_temporary_allocator());
 	
@@ -405,6 +408,38 @@ void dump_stack_trace() {
 	}
 }
 
+
+///
+///
+// Memory
+///
+
+// #Global
+ogb_instance void *program_memory;
+ogb_instance void *program_memory_next;
+ogb_instance u64 program_memory_capacity;
+ogb_instance Mutex_Handle program_memory_mutex;
+
+#if !OOGABOOGA_LINK_EXTERNAL_INSTANCE
+void *program_memory = 0;
+void *program_memory_next = 0;
+u64 program_memory_capacity = 0;
+Mutex_Handle program_memory_mutex = 0;
+#endif // NOT OOGABOOGA_LINK_EXTERNAL_INSTANCE
+
+bool ogb_instance
+os_grow_program_memory(size_t new_size);
+
+// BEWARE:
+// - size must be aligned to os.page_size
+// - Pages will not always belong to the same region (although they will be contiguous in virtual address space)
+// - Pages will be locked (Win32 PAGE_NOACCESS) so you need to unlock with os_unlock_program_memory_pages() before use.
+ogb_instance void*
+os_reserve_next_memory_pages(u64 size);
+
+void ogb_instance
+os_unlock_program_memory_pages(void *start, u64 size);
+
 ///
 ///
 // Mouse pointer
@@ -450,5 +485,4 @@ void ogb_instance
 os_init(u64 program_memory_size);
 
 void ogb_instance
-os_update();
-
+os_update();
\ No newline at end of file
diff --git a/oogabooga/profiling.c b/oogabooga/profiling.c
index 3a6c859..5694e55 100644
--- a/oogabooga/profiling.c
+++ b/oogabooga/profiling.c
@@ -40,17 +40,17 @@ void _profiler_report_time_cycles(string name, u64 count, u64 start) {
 }
 #if ENABLE_PROFILING
 #define tm_scope(name) \
-    for (u64 start_time = os_get_current_cycle_count(), end_time = start_time, elapsed_time = 0; \
+    for (u64 start_time = rdtsc(), end_time = start_time, elapsed_time = 0; \
          elapsed_time == 0; \
-         elapsed_time = (end_time = os_get_current_cycle_count()) - start_time, _profiler_report_time_cycles(STR(name), elapsed_time, start_time))
+         elapsed_time = (end_time = rdtsc()) - start_time, _profiler_report_time_cycles(STR(name), elapsed_time, start_time))
 #define tm_scope_var(name, var) \
-    for (u64 start_time = os_get_current_cycle_count(), end_time = start_time, elapsed_time = 0; \
+    for (u64 start_time = rdtsc(), end_time = start_time, elapsed_time = 0; \
          elapsed_time == 0; \
-         elapsed_time = (end_time = os_get_current_cycle_count()) - start_time, var=elapsed_time)
+         elapsed_time = (end_time = rdtsc()) - start_time, var=elapsed_time)
 #define tm_scope_accum(name, var) \
-    for (u64 start_time = os_get_current_cycle_count(), end_time = start_time, elapsed_time = 0; \
+    for (u64 start_time = rdtsc(), end_time = start_time, elapsed_time = 0; \
          elapsed_time == 0; \
-         elapsed_time = (end_time = os_get_current_cycle_count()) - start_time, var+=elapsed_time)
+         elapsed_time = (end_time = rdtsc()) - start_time, var+=elapsed_time)
 #else
 	#define tm_scope(...)
 	#define tm_scope_var(...)
diff --git a/oogabooga/tests.c b/oogabooga/tests.c
index 21657cd..e55c7ef 100644
--- a/oogabooga/tests.c
+++ b/oogabooga/tests.c
@@ -1208,7 +1208,7 @@ void test_mutex() {
 
     Allocator allocator = get_heap_allocator();
 
-	const int num_threads = 100;
+	const int num_threads = 1000;
 
 	Thread *threads = alloc(allocator, sizeof(Thread)*num_threads);
 	for (u64 i = 0; i < num_threads; i++) {