- Z Layers

- Sorting
- Contiguous quad buffer
Charlie 2024-07-10 17:10:38 +02:00
parent a9cbf7ee68
commit c92b6fd4b7
15 changed files with 386 additions and 254 deletions

View file

@ -3,6 +3,11 @@
///
// Build config stuff
// #Temporary
#define RUN_TESTS 0
#define OOGABOOGA_DEV 1
#define ENABLE_PROFILING 1
#define INITIAL_PROGRAM_MEMORY_SIZE MB(5)
typedef struct Context_Extra {
@ -26,11 +31,11 @@ typedef struct Context_Extra {
//
// this is a minimal starting point for new projects. Copy & rename to get started
#include "oogabooga/examples/minimal_game_loop.c"
// #include "oogabooga/examples/minimal_game_loop.c"
// #include "oogabooga/examples/text_rendering.c"
// #include "oogabooga/examples/custom_logger.c"
// #include "oogabooga/examples/renderer_stress_test.c"
#include "oogabooga/examples/renderer_stress_test.c"
// This is where you swap in your own project!
// #include "entry_yourepicgamename.c"

View file

@ -1,3 +1,17 @@
## v0.00.005 - Z layers
Renderer:
- Added optional Z-sorting
- Either set quad->z or call push_z_layer(s32) (and pop_z_layer())
- Enable with draw_frame.enable_z_sorting = true (see the usage sketch after this list)
- Refactored the quad buffering to just be a growing quad buffer rather than a linked list of quad blocks. Your CPU will be thankful.
Misc:
- removed gfx_impl_legacy_opengl.c
- Sorting procedures
- merge_sort()
- radix_sort()
- sorting tests
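
A minimal usage sketch of the new z-layer API inside a frame, based on the notes above and the renderer_stress_test changes in this commit. Colors and z values are illustrative, and it is an assumption here that draw_rect forwards the Draw_Quad* that draw_quad() returns:

```c
// Opt in to z-sorting for this frame (it is off by default).
draw_frame.enable_z_sorting = true;

// Everything drawn between push and pop gets z = 10.
push_z_layer(10);
draw_rect(v2(-0.5f, -0.5f), v2(1, 1), COLOR_GREEN);
pop_z_layer();

// Or set z directly on the quad a draw call returns
// (assumes draw_rect forwards draw_quad()'s Draw_Quad* return value).
Draw_Quad *q = draw_rect(v2(0, 0), v2(0.5f, 0.5f), COLOR_RED);
q->z = 11; // sorted on top of the green rect
```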
## v0.00.004 - Custom logging, more concurrency & bugfixing
Concurrency:

View file

@ -78,6 +78,7 @@ typedef struct Allocator {
} Allocator;
Allocator get_heap_allocator();
Allocator get_temporary_allocator();
typedef struct Context {
void *logger; // void(*Logger_Proc)(Log_Level level, string fmt, ...)

View file

@ -62,13 +62,11 @@ Usage:
*/
// We use radix sort, so the exact bit count is important
#define MAX_Z_BITS 21
#define MAX_Z ((1 << MAX_Z_BITS)/2)
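// For reference: with MAX_Z_BITS = 21, MAX_Z = (1 << 21)/2 = 1048576, so quad z values
// are expected to stay within roughly [-MAX_Z+1, MAX_Z] (the D3D11 backend asserts this range).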
#define Z_STACK_MAX 4096
#define QUADS_PER_BLOCK 256
typedef struct Draw_Quad {
Vector2 bottom_left, top_left, top_right, bottom_right;
// r, g, b, a
@ -82,33 +80,22 @@ typedef struct Draw_Quad {
Gfx_Filter_Mode image_min_filter;
Gfx_Filter_Mode image_mag_filter;
float32 z;
s32 z;
} Draw_Quad;
typedef struct Draw_Quad_Block {
Draw_Quad quad_buffer[QUADS_PER_BLOCK];
u64 num_quads;
float32 low_z, high_z;
struct Draw_Quad_Block *next;
} Draw_Quad_Block;
// I made these blocks part of the frame at first so they were temp allocated BUT I think
// that was a mistake because these blocks are accessed a lot so we want it to just be
// persistent memory that's super hot all the time.
Draw_Quad_Block first_block = {0};
Draw_Quad *quad_buffer;
u64 allocated_quads;
typedef struct Draw_Frame {
Draw_Quad_Block *current;
u64 num_blocks;
u64 num_quads;
Matrix4 projection;
Matrix4 view;
bool enable_z_sorting;
s32 z_stack[Z_STACK_MAX];
u64 z_count;
} Draw_Frame;
// This frame is passed to the platform layer and rendered in os_update.
// Resets every frame.
@ -117,14 +104,21 @@ Draw_Frame draw_frame = ZERO(Draw_Frame);
void reset_draw_frame(Draw_Frame *frame) {
*frame = (Draw_Frame){0};
frame->current = 0;
float32 aspect = (float32)window.width/(float32)window.height;
frame->projection = m4_make_orthographic_projection(-aspect, aspect, -1, 1, -1, 10);
frame->view = m4_scalar(1.0);
}
frame->num_blocks = 0;
void push_z_layer(s32 z) {
assert(draw_frame.z_count < Z_STACK_MAX, "Too many z layers pushed. You can pop with pop_z_layer() when you are done drawing to it.");
draw_frame.z_stack[draw_frame.z_count] = z;
draw_frame.z_count += 1;
}
void pop_z_layer() {
assert(draw_frame.z_count > 0, "No Z layers to pop!");
draw_frame.z_count -= 1;
}
Draw_Quad *draw_quad_projected(Draw_Quad quad, Matrix4 world_to_clip) {
@ -136,36 +130,29 @@ Draw_Quad *draw_quad_projected(Draw_Quad quad, Matrix4 world_to_clip) {
quad.image_min_filter = GFX_FILTER_MODE_NEAREST;
quad.image_mag_filter = GFX_FILTER_MODE_NEAREST;
if (!draw_frame.current) {
draw_frame.current = &first_block;
draw_frame.current->low_z = F32_MAX;
draw_frame.current->high_z = F32_MIN;
draw_frame.current->num_quads = 0;
draw_frame.num_blocks = 1;
quad.z = 0;
if (draw_frame.z_count > 0) quad.z = draw_frame.z_stack[draw_frame.z_count-1];
if (draw_frame.num_quads >= allocated_quads) {
// #Memory
u64 new_count = max(get_next_power_of_two(draw_frame.num_quads+1), 128);
Draw_Quad *new_buffer = alloc(get_heap_allocator(), new_count*sizeof(Draw_Quad));
if (quad_buffer) {
memcpy(new_buffer, quad_buffer, draw_frame.num_quads*sizeof(Draw_Quad));
dealloc(get_heap_allocator(), quad_buffer);
}
assert(draw_frame.current->num_quads <= QUADS_PER_BLOCK);
if (draw_frame.current->num_quads == QUADS_PER_BLOCK) {
if (!draw_frame.current->next) {
draw_frame.current->next = cast(Draw_Quad_Block*)alloc(get_heap_allocator(), sizeof(Draw_Quad_Block));
*draw_frame.current->next = ZERO(Draw_Quad_Block);
quad_buffer = new_buffer;
allocated_quads = new_count;
}
draw_frame.current = draw_frame.current->next;
draw_frame.current->num_quads = 0;
draw_frame.current->low_z = F32_MAX;
draw_frame.current->high_z = F32_MIN;
quad_buffer[draw_frame.num_quads] = quad;
draw_frame.num_quads += 1;
draw_frame.num_blocks += 1;
}
draw_frame.current->quad_buffer[draw_frame.current->num_quads] = quad;
draw_frame.current->num_quads += 1;
return &draw_frame.current->quad_buffer[draw_frame.current->num_quads-1];
return &quad_buffer[draw_frame.num_quads-1];
}
Draw_Quad *draw_quad(Draw_Quad quad) {
return draw_quad_projected(quad, m4_mul(draw_frame.projection, m4_inverse(draw_frame.view)));

View file

@ -19,6 +19,8 @@ int entry(int argc, char **argv) {
rect_xform = m4_translate(rect_xform, v3(-.25f, -.25f, 0));
draw_rect_xform(rect_xform, v2(.5f, .5f), COLOR_GREEN);
draw_rect(v2(sin(now), -.8), v2(.5, .25), COLOR_RED);
gfx_update();
}

View file

@ -87,8 +87,12 @@ int entry(int argc, char **argv) {
camera_view = m4_translate(camera_view, v3(v2_expand(cam_move), 0));
draw_frame.view = camera_view;
local_persist bool do_enable_z_sorting = false;
draw_frame.enable_z_sorting = do_enable_z_sorting;
if (is_key_just_pressed('Z')) do_enable_z_sorting = !do_enable_z_sorting;
seed_for_random = 69;
for (u64 i = 0; i < 100000; i++) {
for (u64 i = 0; i < 50000; i++) {
float32 aspect = (float32)window.width/(float32)window.height;
float min_x = -aspect;
float max_x = aspect;
@ -98,15 +102,18 @@ int entry(int argc, char **argv) {
float x = get_random_float32() * (max_x-min_x) + min_x;
float y = get_random_float32() * (max_y-min_y) + min_y;
push_z_layer((s32)(y*100));
draw_image(bush_image, v2(x, y), v2(0.1, 0.1), COLOR_WHITE);
pop_z_layer();
}
seed_for_random = os_get_current_cycle_count();
Matrix4 hammer_xform = m4_scalar(1.0);
hammer_xform = m4_rotate_z(hammer_xform, (f32)now);
hammer_xform = m4_translate(hammer_xform, v3(-.25f, -.25f, 0));
push_z_layer(1000001);
draw_image_xform(hammer_image, hammer_xform, v2(.5f, .5f), COLOR_RED);
pop_z_layer();
Vector2 hover_position = v2_rotate_point_around_pivot(v2(-.5, -.5), v2(0, 0), (f32)now);
Vector2 local_pivot = v2(.125f, .125f);

View file

@ -15,7 +15,6 @@ string temp_win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16);
typedef struct alignat(16) D3D11_Vertex {
Vector4 color;
Vector4 position;
Vector2 uv;
@ -59,6 +58,9 @@ ID3D11Buffer *d3d11_quad_vbo = 0;
u32 d3d11_quad_vbo_size = 0;
void *d3d11_staging_quad_buffer = 0;
Draw_Quad *sort_quad_buffer = 0;
u64 sort_quad_buffer_size = 0;
const char* d3d11_stringify_category(D3D11_MESSAGE_CATEGORY category) {
switch (category) {
case D3D11_MESSAGE_CATEGORY_APPLICATION_DEFINED: return "Application Defined";
@ -545,7 +547,7 @@ void d3d11_process_draw_frame() {
///
// Maybe grow quad vbo
u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6;
u32 required_size = sizeof(D3D11_Vertex) * allocated_quads*6;
if (required_size > d3d11_quad_vbo_size) {
if (d3d11_quad_vbo) {
@ -567,7 +569,7 @@ void d3d11_process_draw_frame() {
log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size);
}
if (draw_frame.num_blocks > 0) {
if (draw_frame.num_quads > 0) {
///
// Render geometry from the draw frame into the vbo quad list
@ -580,14 +582,24 @@ void d3d11_process_draw_frame() {
D3D11_Vertex* head = (D3D11_Vertex*)d3d11_staging_quad_buffer;
D3D11_Vertex* pointer = head;
u64 number_of_rendered_quads = 0;
Draw_Quad_Block *block = &first_block;
tm_scope_cycles("Quad processing") {
u64 block_index = 0;
while (block != 0 && block->num_quads > 0 && block_index < draw_frame.num_blocks) tm_scope_cycles("Quad block") {
for (u64 i = 0; i < block->num_quads; i++) {
if (draw_frame.enable_z_sorting) tm_scope_cycles("Z sorting") {
if (!sort_quad_buffer || (sort_quad_buffer_size < allocated_quads*sizeof(Draw_Quad))) {
// #Memory #Heapalloc
if (sort_quad_buffer) dealloc(get_heap_allocator(), sort_quad_buffer);
sort_quad_buffer = alloc(get_heap_allocator(), allocated_quads*sizeof(Draw_Quad));
sort_quad_buffer_size = allocated_quads*sizeof(Draw_Quad);
}
radix_sort(quad_buffer, sort_quad_buffer, draw_frame.num_quads, sizeof(Draw_Quad), offsetof(Draw_Quad, z), MAX_Z_BITS);
}
Draw_Quad *q = &block->quad_buffer[i];
for (u64 i = 0; i < draw_frame.num_quads; i++) {
Draw_Quad *q = &quad_buffer[i];
assert(q->z <= MAX_Z, "Z is too high. Z is %d, Max is %d.", q->z, MAX_Z);
assert(q->z >= (-MAX_Z+1), "Z is too low. Z is %d, Min is %d.", q->z, -MAX_Z+1);
s8 texture_index = -1;
@ -691,10 +703,6 @@ void d3d11_process_draw_frame() {
number_of_rendered_quads += 1;
}
}
block_index += 1;
block = block->next;
}
}
tm_scope_cycles("Write to gpu") {
@ -738,7 +746,9 @@ void gfx_update() {
d3d11_process_draw_frame();
tm_scope_cycles("Present") {
VTABLE(Present, d3d11_swap_chain, window.enable_vsync, window.enable_vsync ? 0 : DXGI_PRESENT_ALLOW_TEARING);
}
#if CONFIGURATION == DEBUG

View file

@ -1,80 +0,0 @@
// #Temporary #Cleanup
// #Temporary #Cleanup
// #Temporary #Cleanup
// #Temporary #Cleanup
#include "GL/gl.h"
HDC hdc;
typedef BOOL (APIENTRY *PFNWGLSWAPINTERVALEXTPROC) (int interval);
const Gfx_Handle GFX_INVALID_HANDLE = 0;
void gfx_init() {
// #Temporary #Cleanup
// #Temporary #Cleanup
// #Temporary #Cleanup
// #Temporary #Cleanup
PIXELFORMATDESCRIPTOR pfd = {
sizeof(PIXELFORMATDESCRIPTOR),
1,
PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER,
PFD_TYPE_RGBA,
32,
0, 0, 0, 0, 0, 0,
0, 0,
0, 0, 0, 0, 0,
24,
8,
0,
PFD_MAIN_PLANE,
0,
0, 0, 0
};
hdc = GetDC(window._os_handle);
int pixelFormat = ChoosePixelFormat(hdc, &pfd);
SetPixelFormat(hdc, pixelFormat, &pfd);
HGLRC hglrc = wglCreateContext(hdc);
wglMakeCurrent(hdc, hglrc);
PFNWGLSWAPINTERVALEXTPROC wglSwapIntervalEXT = (PFNWGLSWAPINTERVALEXTPROC) wglGetProcAddress("wglSwapIntervalEXT");
assert(wglSwapIntervalEXT, "Could not load wglSwapIntervalEXT");
wglSwapIntervalEXT(0);
glEnable(GL_BLEND);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
}
void gfx_update() {
Draw_Quad_Block *block = &draw_frame.first_block;
glBegin(GL_QUADS);
while (block != 0) {
for (u64 i = 0; i < block->num_quads; i++) {
Draw_Quad q = block->quad_buffer[i];
glColor4f(v4_expand(q.color));
glVertex2f(v2_expand(q.bottom_left));
glVertex2f(v2_expand(q.top_left));
glVertex2f(v2_expand(q.top_right));
glVertex2f(v2_expand(q.bottom_right));
}
block = block->next;
}
glEnd();
SwapBuffers(hdc);
glClearColor(window.clear_color.r, window.clear_color.g, window.clear_color.b, window.clear_color.a);
glClear(GL_COLOR_BUFFER_BIT);
glViewport(0, 0, window.width, window.height);
draw_frame = ZERO(Draw_Frame);
}

View file

@ -489,6 +489,9 @@ thread_local void * temporary_storage_pointer = 0;
thread_local bool has_warned_temporary_storage_overflow = false;
thread_local Allocator temp;
Allocator get_temporary_allocator() {
return temp;
}
void* temp_allocator_proc(u64 size, void *p, Allocator_Message message, void* data) {
switch (message) {

View file

@ -107,7 +107,7 @@
#define OGB_VERSION_MAJOR 0
#define OGB_VERSION_MINOR 0
#define OGB_VERSION_PATCH 4
#define OGB_VERSION_PATCH 5
#define OGB_VERSION (OGB_VERSION_MAJOR*1000000+OGB_VERSION_MINOR*1000+OGB_VERSION_PATCH)
@ -266,6 +266,7 @@ typedef u8 bool;
#include "path_utils.c"
#include "linmath.c"
#include "range.c"
#include "utility.c"
#include "hash_table.c"
@ -292,8 +293,6 @@ typedef u8 bool;
#error "We only have a D3D11 renderer at the moment"
#elif GFX_RENDERER == GFX_RENDERER_METAL
#error "We only have a D3D11 renderer at the moment"
#elif GFX_RENDERER == GFX_RENDERER_LEGACY_OPENGL
#include "gfx_impl_legacy_opengl.c"
#else
#error "Unknown renderer GFX_RENDERER defined"
#endif

View file

@ -139,7 +139,6 @@ void os_init(u64 program_memory_size) {
memset(&window, 0, sizeof(window));
timeBeginPeriod(1);
#if CONFIGURATION == RELEASE
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
#endif
@ -364,7 +363,6 @@ bool os_grow_program_memory(u64 new_size) {
DWORD WINAPI win32_thread_invoker(LPVOID param) {
timeBeginPeriod(1);
#if CONFIGURATION == RELEASE
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
#endif
@ -518,13 +516,15 @@ void os_high_precision_sleep(f64 ms) {
s32 sleep_time = (s32)((end-start)-1.0);
bool do_sleep = sleep_time >= 1;
timeBeginPeriod(1); // I don't see a reason to reset this
timeBeginPeriod(1);
if (do_sleep) os_sleep(sleep_time);
while (os_get_current_time_in_seconds() < end) {
os_yield_thread();
}
timeEndPeriod(1);
}

View file

@ -24,7 +24,7 @@ void _profiler_report_time_cycles(string name, u64 count, u64 start) {
spinlock_acquire_or_wait(&_profiler_lock);
string fmt = STR("{\"cat\":\"function\",\"dur\":%.3f,\"name\":\"%s\",\"ph\":\"X\",\"pid\":0,\"tid\":%zu,\"ts\":%lld},");
string_builder_print(&_profile_output, fmt, (float64)count*1000, name, GetCurrentThreadId(), start*1000);
string_builder_print(&_profile_output, fmt, (float64)count*1000, name, context.thread_id, start*1000);
spinlock_release(&_profiler_lock);
}

View file

@ -3,6 +3,15 @@
// I know that we'll have a Range2i at some point, so maybe it's better to be explicit for less confusion?
// I'll leave this decision up to u charlie just delete this whenever u see it
// charlie:
// Is this range stuff really necessary?
// Why not just:
// typedef Vector2 Range1f;
// typedef Vector4 Range2f;
// Vector4 also already has aliases for x1, y1, x2, y2 and we could add an alias for min & max vectors (see linmath.c)
// This feels like introducing unnecessary complexity and vocabulary when it's really just
// another way to say Vector2 and Vector4.
typedef struct Range1f {
float min;
float max;

View file

@ -1141,10 +1141,80 @@ void test_mutex() {
mutex_destroy(&data.mutex);
}
int compare_draw_quads(const void *a, const void *b) {
return ((Draw_Quad*)a)->z-((Draw_Quad*)b)->z;
}
void test_sort() {
int num_samples = 100;
u64 id_bits = 21;
u64 item_count = 5000;
f64 seconds = 0;
u64 cycles = 0;
Draw_Quad *items = alloc(get_heap_allocator(), (item_count * 2) * sizeof(Draw_Quad));
Draw_Quad *buffer = items + item_count;
for (int a = 0; a < num_samples; a++) {
for (u64 i = 0; i < item_count; i++) {
if (i % 2 == 0) items[i].z = get_random_int_in_range(0, pow(2, id_bits) / 2);
else items[i].z = i;
}
u64 item_size = sizeof(Draw_Quad);
u64 sort_value_offset_in_item = offsetof(Draw_Quad, z);
float64 start_seconds = os_get_current_time_in_seconds();
u64 start_cycles = os_get_current_cycle_count();
radix_sort(items, buffer, item_count, item_size, sort_value_offset_in_item, id_bits);
u64 end_cycles = os_get_current_cycle_count();
float64 end_seconds = os_get_current_time_in_seconds();
for (u64 i = 1; i < item_count; i++) {
assert(items[i].z >= items[i-1].z, "Failed: not correctly sorted");
}
seconds += end_seconds - start_seconds;
cycles += end_cycles - start_cycles;
}
print("Radix sort took on average %llu cycles and %.2f ms\n", cycles / num_samples, (seconds * 1000.0) / (float64)num_samples);
seconds = 0;
cycles = 0;
for (int a = 0; a < num_samples; a++) {
for (u64 i = 0; i < item_count; i++) {
if (i % 2 == 0) items[i].z = get_random_int_in_range(0, pow(2, id_bits) / 2);
else items[i].z = i;
}
u64 item_size = sizeof(Draw_Quad);
u64 sort_value_offset_in_item = offsetof(Draw_Quad, z);
float64 start_seconds = os_get_current_time_in_seconds();
u64 start_cycles = os_get_current_cycle_count();
merge_sort(items, buffer, item_count, item_size, compare_draw_quads);
u64 end_cycles = os_get_current_cycle_count();
float64 end_seconds = os_get_current_time_in_seconds();
for (u64 i = 1; i < item_count; i++) {
assert(items[i].z >= items[i-1].z, "Failed: not correctly sorted");
}
seconds += end_seconds - start_seconds;
cycles += end_cycles - start_cycles;
}
print("Merge sort took on average %llu cycles and %.2f ms\n", cycles / num_samples, (seconds * 1000.0) / (float64)num_samples);
}
void oogabooga_run_tests() {
print("Testing allocator... ");
/*print("Testing allocator... ");
test_allocator(true);
print("OK!\n");
@ -1178,7 +1248,9 @@ void oogabooga_run_tests() {
print("Testing mutex... ");
test_mutex();
print("OK!\n");*/
print("Testing radix sort... ");
test_sort();
print("OK!\n");
}

oogabooga/utility.c (new file, 103 additions)
View file

@ -0,0 +1,103 @@
// This is a very niche sort algorithm.
// I use it for Z sorting quads.
// help_buffer should be the same size as the collection.
// This only works with integers, and it will use the first number_of_bits in the integer
// at sort_value_offset_in_item for sorting.
// There is a cost of memory as we need to double the buffer we're sorting BUT the performance
// gain is very promising.
// At 21 bits I'm able to sort a completely randomized collection of 100k integers at around
// 8m cycles (or 2.5-2.6ms on my shitty laptop i5-11300H)
void radix_sort(void *collection, void *help_buffer, u64 item_count, u64 item_size, u64 sort_value_offset_in_item, u64 number_of_bits) {
local_persist const int RADIX = 256;
local_persist const int BITS_PER_PASS = 8;
local_persist const int MASK = (RADIX - 1);
const int PASS_COUNT = ((number_of_bits + BITS_PER_PASS - 1) / BITS_PER_PASS);
const u64 SIGN_SHIFT = 1ULL << (number_of_bits - 1);
u64* count = (u64*)alloc(get_temporary_allocator(), RADIX * sizeof(u64));
u8* items = (u8*)collection;
u8* buffer = (u8*)help_buffer;
for (u32 pass = 0; pass < PASS_COUNT; ++pass) {
u32 shift = pass * BITS_PER_PASS;
for (u32 i = 0; i < RADIX; ++i) {
count[i] = 0;
}
for (u64 i = 0; i < item_count; ++i) {
u64 sort_value = *(u64*)(items + i * item_size + sort_value_offset_in_item);
sort_value += SIGN_SHIFT; // Transform the value to handle negative numbers
u32 digit = (sort_value >> shift) & MASK;
++count[digit];
}
u64 sum = 0;
for (u32 i = 0; i < RADIX; ++i) {
u64 temp = count[i];
count[i] = sum;
sum += temp;
}
for (u64 i = 0; i < item_count; ++i) {
u64 sort_value = *(u64*)(items + i * item_size + sort_value_offset_in_item);
u64 transformed_value = sort_value + SIGN_SHIFT; // Transform the value to handle negative numbers
u32 digit = (transformed_value >> shift) & MASK;
memcpy(buffer + count[digit] * item_size, items + i * item_size, item_size);
++count[digit];
}
memcpy(items, buffer, item_count * item_size);
}
dealloc(get_temporary_allocator(), count);
}
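// A minimal usage sketch of radix_sort, assuming an illustrative caller-side Example_Entry
// struct (not part of this file); it mirrors how test_sort drives the sorter. Note that
// radix_sort reads a u64 at sort_value_offset_in_item, so at least 8 bytes must be readable there.
typedef struct Example_Entry { s32 key; u64 payload; } Example_Entry;
void example_sort_entries(Example_Entry *entries, u64 count) {
	// The help buffer must be at least as large as the collection being sorted.
	Example_Entry *scratch = alloc(get_heap_allocator(), count * sizeof(Example_Entry));
	// Sort ascending by the low 21 bits of 'key'; negative keys in that range are
	// handled by the SIGN_SHIFT transform inside radix_sort.
	radix_sort(entries, scratch, count, sizeof(Example_Entry), offsetof(Example_Entry, key), 21);
	dealloc(get_heap_allocator(), scratch);
}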
void merge_sort(void *collection, void *help_buffer, u64 item_count, u64 item_size, int (*compare)(const void *, const void *)) {
u8 *items = (u8 *)collection;
u8 *buffer = (u8 *)help_buffer;
for (u64 width = 1; width < item_count; width *= 2) {
for (u64 i = 0; i < item_count; i += 2 * width) {
u64 left = i;
u64 right = (i + width < item_count) ? (i + width) : item_count;
u64 end = (i + 2 * width < item_count) ? (i + 2 * width) : item_count;
u64 left_index = left;
u64 right_index = right;
u64 k = left;
while (left_index < right && right_index < end) {
if (compare(items + left_index * item_size, items + right_index * item_size) <= 0) {
memcpy(buffer + k * item_size, items + left_index * item_size, item_size);
left_index++;
} else {
memcpy(buffer + k * item_size, items + right_index * item_size, item_size);
right_index++;
}
k++;
}
while (left_index < right) {
memcpy(buffer + k * item_size, items + left_index * item_size, item_size);
left_index++;
k++;
}
while (right_index < end) {
memcpy(buffer + k * item_size, items + right_index * item_size, item_size);
right_index++;
k++;
}
for (u64 j = left; j < end; j++) {
memcpy(items + j * item_size, buffer + j * item_size, item_size);
}
}
}
}