diff --git a/build.c b/build.c index 0ee9d3a..71f59cd 100644 --- a/build.c +++ b/build.c @@ -3,6 +3,11 @@ /// // Build config stuff +// #Temporary +#define RUN_TESTS 0 +#define OOGABOOGA_DEV 1 +#define ENABLE_PROFILING 1 + #define INITIAL_PROGRAM_MEMORY_SIZE MB(5) typedef struct Context_Extra { @@ -26,11 +31,11 @@ typedef struct Context_Extra { // // this is a minimal starting point for new projects. Copy & rename to get started -#include "oogabooga/examples/minimal_game_loop.c" +// #include "oogabooga/examples/minimal_game_loop.c" // #include "oogabooga/examples/text_rendering.c" // #include "oogabooga/examples/custom_logger.c" -// #include "oogabooga/examples/renderer_stress_test.c" +#include "oogabooga/examples/renderer_stress_test.c" // This is where you swap in your own project! // #include "entry_yourepicgamename.c" diff --git a/changelog.txt b/changelog.txt index 1b2cc11..09eb19e 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,17 @@ +## v0.00.005 - Z layers +Renderer: + - Added optional Z-sorting + - Either set quad->z or call push_z_layer(s32) (and pop_z_layer()) + - Enable with draw_frame.enable_z_sorting = true + - Refactored the quad buffering to just be a growing quad buffer rather than a linked list of quad blocks. Your CPU will be thankful. + +Misc: + - removed gfx_impl_legacy_opengl.c + - Sorting procedures + - merge_sort() + - radix_sort() + - sorting tests + ## v0.00.004 - Custom logging, more concurrency & bugfixing Concurrency: diff --git a/oogabooga/base.c b/oogabooga/base.c index bcf81f0..d183794 100644 --- a/oogabooga/base.c +++ b/oogabooga/base.c @@ -78,6 +78,7 @@ typedef struct Allocator { } Allocator; Allocator get_heap_allocator(); +Allocator get_temporary_allocator(); typedef struct Context { void *logger; // void(*Logger_Proc)(Log_Level level, string fmt, ...) 
diff --git a/oogabooga/drawing.c b/oogabooga/drawing.c index 5fc7715..bfce8ff 100644 --- a/oogabooga/drawing.c +++ b/oogabooga/drawing.c @@ -62,13 +62,11 @@ Usage: */ +// We use radix sort so the exact bit count is important +#define MAX_Z_BITS 21 +#define MAX_Z ((1 << MAX_Z_BITS)/2) +#define Z_STACK_MAX 4096 - - - - - -#define QUADS_PER_BLOCK 256 typedef struct Draw_Quad { Vector2 bottom_left, top_left, top_right, bottom_right; // r, g, b, a @@ -82,33 +80,22 @@ typedef struct Draw_Quad { Gfx_Filter_Mode image_min_filter; Gfx_Filter_Mode image_mag_filter; - float32 z; + s32 z; } Draw_Quad; -typedef struct Draw_Quad_Block { - Draw_Quad quad_buffer[QUADS_PER_BLOCK]; - u64 num_quads; - - float32 low_z, high_z; - - struct Draw_Quad_Block *next; -} Draw_Quad_Block; - -// I made these blocks part of the frame at first so they were temp allocated BUT I think -// that was a mistake because these blocks are accessed a lot so we want it to just be -// persistent memory that's super hot all the time. -Draw_Quad_Block first_block = {0}; - +Draw_Quad *quad_buffer; +u64 allocated_quads; typedef struct Draw_Frame { - Draw_Quad_Block *current; - u64 num_blocks; + u64 num_quads; Matrix4 projection; Matrix4 view; bool enable_z_sorting; + s32 z_stack[Z_STACK_MAX]; + u64 z_count; } Draw_Frame; // This frame is passed to the platform layer and rendered in os_update. // Resets every frame. @@ -117,14 +104,21 @@ Draw_Frame draw_frame = ZERO(Draw_Frame); void reset_draw_frame(Draw_Frame *frame) { *frame = (Draw_Frame){0}; - frame->current = 0; - float32 aspect = (float32)window.width/(float32)window.height; frame->projection = m4_make_orthographic_projection(-aspect, aspect, -1, 1, -1, 10); - frame->view = m4_scalar(1.0); + frame->view = m4_scalar(1.0); +} + +void push_z_layer(s32 z) { + assert(draw_frame.z_count < Z_STACK_MAX, "Too many z layers pushed. 
You can pop with pop_z_layer() when you are done drawing to it."); - frame->num_blocks = 0; + draw_frame.z_stack[draw_frame.z_count] = z; + draw_frame.z_count += 1; +} +void pop_z_layer() { + assert(draw_frame.z_count > 0, "No Z layers to pop!"); + draw_frame.z_count -= 1; } Draw_Quad *draw_quad_projected(Draw_Quad quad, Matrix4 world_to_clip) { @@ -135,37 +129,30 @@ Draw_Quad *draw_quad_projected(Draw_Quad quad, Matrix4 world_to_clip) { quad.image_min_filter = GFX_FILTER_MODE_NEAREST; quad.image_mag_filter = GFX_FILTER_MODE_NEAREST; - - if (!draw_frame.current) { - draw_frame.current = &first_block; - draw_frame.current->low_z = F32_MAX; - draw_frame.current->high_z = F32_MIN; - draw_frame.current->num_quads = 0; - draw_frame.num_blocks = 1; - } - assert(draw_frame.current->num_quads <= QUADS_PER_BLOCK); + quad.z = 0; + if (draw_frame.z_count > 0) quad.z = draw_frame.z_stack[draw_frame.z_count-1]; - if (draw_frame.current->num_quads == QUADS_PER_BLOCK) { + if (draw_frame.num_quads >= allocated_quads) { + // #Memory - if (!draw_frame.current->next) { - draw_frame.current->next = cast(Draw_Quad_Block*)alloc(get_heap_allocator(), sizeof(Draw_Quad_Block)); - *draw_frame.current->next = ZERO(Draw_Quad_Block); + u64 new_count = max(get_next_power_of_two(draw_frame.num_quads+1), 128); + + Draw_Quad *new_buffer = alloc(get_heap_allocator(), new_count*sizeof(Draw_Quad)); + + if (quad_buffer) { + memcpy(new_buffer, quad_buffer, draw_frame.num_quads*sizeof(Draw_Quad)); + dealloc(get_heap_allocator(), quad_buffer); } - draw_frame.current = draw_frame.current->next; - draw_frame.current->num_quads = 0; - draw_frame.current->low_z = F32_MAX; - draw_frame.current->high_z = F32_MIN; - - draw_frame.num_blocks += 1; - + quad_buffer = new_buffer; + allocated_quads = new_count; } - draw_frame.current->quad_buffer[draw_frame.current->num_quads] = quad; - draw_frame.current->num_quads += 1; + quad_buffer[draw_frame.num_quads] = quad; + draw_frame.num_quads += 1; - return 
&draw_frame.current->quad_buffer[draw_frame.current->num_quads-1]; + return &quad_buffer[draw_frame.num_quads-1]; } Draw_Quad *draw_quad(Draw_Quad quad) { return draw_quad_projected(quad, m4_mul(draw_frame.projection, m4_inverse(draw_frame.view))); diff --git a/oogabooga/examples/minimal_game_loop.c b/oogabooga/examples/minimal_game_loop.c index 0e20b3d..40f232c 100644 --- a/oogabooga/examples/minimal_game_loop.c +++ b/oogabooga/examples/minimal_game_loop.c @@ -19,6 +19,8 @@ int entry(int argc, char **argv) { rect_xform = m4_translate(rect_xform, v3(-.25f, -.25f, 0)); draw_rect_xform(rect_xform, v2(.5f, .5f), COLOR_GREEN); + draw_rect(v2(sin(now), -.8), v2(.5, .25), COLOR_RED); + gfx_update(); } diff --git a/oogabooga/examples/renderer_stress_test.c b/oogabooga/examples/renderer_stress_test.c index d8f90a9..e257251 100644 --- a/oogabooga/examples/renderer_stress_test.c +++ b/oogabooga/examples/renderer_stress_test.c @@ -87,8 +87,12 @@ int entry(int argc, char **argv) { camera_view = m4_translate(camera_view, v3(v2_expand(cam_move), 0)); draw_frame.view = camera_view; + local_persist bool do_enable_z_sorting = false; + draw_frame.enable_z_sorting = do_enable_z_sorting; + if (is_key_just_pressed('Z')) do_enable_z_sorting = !do_enable_z_sorting; + seed_for_random = 69; - for (u64 i = 0; i < 100000; i++) { + for (u64 i = 0; i < 50000; i++) { float32 aspect = (float32)window.width/(float32)window.height; float min_x = -aspect; float max_x = aspect; @@ -98,15 +102,18 @@ int entry(int argc, char **argv) { float x = get_random_float32() * (max_x-min_x) + min_x; float y = get_random_float32() * (max_y-min_y) + min_y; + push_z_layer((s32)(y*100)); draw_image(bush_image, v2(x, y), v2(0.1, 0.1), COLOR_WHITE); + pop_z_layer(); } seed_for_random = os_get_current_cycle_count(); - Matrix4 hammer_xform = m4_scalar(1.0); hammer_xform = m4_rotate_z(hammer_xform, (f32)now); hammer_xform = m4_translate(hammer_xform, v3(-.25f, -.25f, 0)); + push_z_layer(1000001); 
draw_image_xform(hammer_image, hammer_xform, v2(.5f, .5f), COLOR_RED); + pop_z_layer(); Vector2 hover_position = v2_rotate_point_around_pivot(v2(-.5, -.5), v2(0, 0), (f32)now); Vector2 local_pivot = v2(.125f, .125f); diff --git a/oogabooga/gfx_impl_d3d11.c b/oogabooga/gfx_impl_d3d11.c index 1d2c0c3..06c079c 100644 --- a/oogabooga/gfx_impl_d3d11.c +++ b/oogabooga/gfx_impl_d3d11.c @@ -15,7 +15,6 @@ string temp_win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16); typedef struct alignat(16) D3D11_Vertex { - Vector4 color; Vector4 position; Vector2 uv; @@ -59,6 +58,9 @@ ID3D11Buffer *d3d11_quad_vbo = 0; u32 d3d11_quad_vbo_size = 0; void *d3d11_staging_quad_buffer = 0; +Draw_Quad *sort_quad_buffer = 0; +u64 sort_quad_buffer_size = 0; + const char* d3d11_stringify_category(D3D11_MESSAGE_CATEGORY category) { switch (category) { case D3D11_MESSAGE_CATEGORY_APPLICATION_DEFINED: return "Application Defined"; @@ -545,7 +547,7 @@ void d3d11_process_draw_frame() { /// // Maybe grow quad vbo - u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6; + u32 required_size = sizeof(D3D11_Vertex) * allocated_quads*6; if (required_size > d3d11_quad_vbo_size) { if (d3d11_quad_vbo) { @@ -567,7 +569,7 @@ void d3d11_process_draw_frame() { log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size); } - if (draw_frame.num_blocks > 0) { + if (draw_frame.num_quads > 0) { /// // Render geometry from into vbo quad list @@ -580,120 +582,126 @@ void d3d11_process_draw_frame() { D3D11_Vertex* head = (D3D11_Vertex*)d3d11_staging_quad_buffer; D3D11_Vertex* pointer = head; u64 number_of_rendered_quads = 0; - Draw_Quad_Block *block = &first_block; tm_scope_cycles("Quad processing") { - u64 block_index = 0; - while (block != 0 && block->num_quads > 0 && block_index < draw_frame.num_blocks) tm_scope_cycles("Quad block") { - for (u64 i = 0; i < block->num_quads; i++) { + if (draw_frame.enable_z_sorting) tm_scope_cycles("Z sorting") { + if (!sort_quad_buffer || 
(sort_quad_buffer_size < allocated_quads*sizeof(Draw_Quad))) { + // #Memory #Heapalloc + if (sort_quad_buffer) dealloc(get_heap_allocator(), sort_quad_buffer); + sort_quad_buffer = alloc(get_heap_allocator(), allocated_quads*sizeof(Draw_Quad)); + sort_quad_buffer_size = allocated_quads*sizeof(Draw_Quad); + } + radix_sort(quad_buffer, sort_quad_buffer, draw_frame.num_quads, sizeof(Draw_Quad), offsetof(Draw_Quad, z), MAX_Z_BITS); + } + + for (u64 i = 0; i < draw_frame.num_quads; i++) { + + Draw_Quad *q = &quad_buffer[i]; + + assert(q->z <= MAX_Z, "Z is too high. Z is %d, Max is %d.", q->z, MAX_Z); + assert(q->z >= (-MAX_Z+1), "Z is too low. Z is %d, Min is %d.", q->z, -MAX_Z+1); + + s8 texture_index = -1; + + if (q->image) { - Draw_Quad *q = &block->quad_buffer[i]; - - s8 texture_index = -1; - - if (q->image) { - - if (last_texture == q->image->gfx_handle) { - texture_index = last_texture_index; - } else { - // First look if texture is already bound - for (u64 j = 0; j < num_textures; j++) { - if (textures[j] == q->image->gfx_handle) { - texture_index = (s8)j; - break; - } - } - // Otherwise use a new slot - if (texture_index <= -1) { - if (num_textures >= 32) { - // If max textures reached, make a draw call and start over - D3D11_MAPPED_SUBRESOURCE buffer_mapping; - VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping); - memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6); - VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0); - d3d11_draw_call(number_of_rendered_quads, textures, num_textures); - head = (D3D11_Vertex*)d3d11_staging_quad_buffer; - num_textures = 0; - texture_index = 0; - number_of_rendered_quads = 0; - pointer = head; - } else { - texture_index = (s8)num_textures; - num_textures += 1; - } + if (last_texture == q->image->gfx_handle) { + texture_index = last_texture_index; + } else { + // First look if texture is already bound + for 
(u64 j = 0; j < num_textures; j++) { + if (textures[j] == q->image->gfx_handle) { + texture_index = (s8)j; + break; + } + } + // Otherwise use a new slot + if (texture_index <= -1) { + if (num_textures >= 32) { + // If max textures reached, make a draw call and start over + D3D11_MAPPED_SUBRESOURCE buffer_mapping; + VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping); + memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6); + VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0); + d3d11_draw_call(number_of_rendered_quads, textures, num_textures); + head = (D3D11_Vertex*)d3d11_staging_quad_buffer; + num_textures = 0; + texture_index = 0; + number_of_rendered_quads = 0; + pointer = head; + } else { + texture_index = (s8)num_textures; + num_textures += 1; } } - textures[texture_index] = q->image->gfx_handle; - last_texture = q->image->gfx_handle; - last_texture_index = texture_index; - } - - if (q->type == QUAD_TYPE_TEXT) { - float pixel_width = 2.0/(float)window.width; - float pixel_height = 2.0/(float)window.height; - - q->bottom_left.x = round(q->bottom_left.x / pixel_width) * pixel_width; - q->bottom_left.y = round(q->bottom_left.y / pixel_height) * pixel_height; - q->top_left.x = round(q->top_left.x / pixel_width) * pixel_width; - q->top_left.y = round(q->top_left.y / pixel_height) * pixel_height; - q->top_right.x = round(q->top_right.x / pixel_width) * pixel_width; - q->top_right.y = round(q->top_right.y / pixel_height) * pixel_height; - q->bottom_right.x = round(q->bottom_right.x / pixel_width) * pixel_width; - q->bottom_right.y = round(q->bottom_right.y / pixel_height) * pixel_height; - } - - // We will write to 6 vertices for the one quad (two tris) - { - - D3D11_Vertex* BL = pointer + 0; - D3D11_Vertex* TL = pointer + 1; - D3D11_Vertex* TR = pointer + 2; - D3D11_Vertex* BL2 = pointer + 3; - D3D11_Vertex* TR2 = pointer + 4; - D3D11_Vertex* BR 
= pointer + 5; - pointer += 6; - - BL->position = v4(q->bottom_left.x, q->bottom_left.y, 0, 1); - TL->position = v4(q->top_left.x, q->top_left.y, 0, 1); - TR->position = v4(q->top_right.x, q->top_right.y, 0, 1); - BR->position = v4(q->bottom_right.x, q->bottom_right.y, 0, 1); - - BL->uv = v2(q->uv.x1, q->uv.y1); - TL->uv = v2(q->uv.x1, q->uv.y2); - TR->uv = v2(q->uv.x2, q->uv.y2); - BR->uv = v2(q->uv.x2, q->uv.y1); - - BL->color = TL->color = TR->color = BR->color = q->color; - - BL->texture_index=TL->texture_index=TR->texture_index=BR->texture_index = texture_index; - BL->type=TL->type=TR->type=BR->type = (u8)q->type; - - u8 sampler = -1; - if (q->image_min_filter == GFX_FILTER_MODE_NEAREST - && q->image_mag_filter == GFX_FILTER_MODE_NEAREST) - sampler = 0; - if (q->image_min_filter == GFX_FILTER_MODE_LINEAR - && q->image_mag_filter == GFX_FILTER_MODE_LINEAR) - sampler = 1; - if (q->image_min_filter == GFX_FILTER_MODE_LINEAR - && q->image_mag_filter == GFX_FILTER_MODE_NEAREST) - sampler = 2; - if (q->image_min_filter == GFX_FILTER_MODE_NEAREST - && q->image_mag_filter == GFX_FILTER_MODE_LINEAR) - sampler = 3; - - BL->type=TL->type=TR->type=BR->type = (u8)q->type; - BL->sampler=TL->sampler=TR->sampler=BR->sampler = (u8)sampler; - - *BL2 = *BL; - *TR2 = *TR; - - number_of_rendered_quads += 1; } + textures[texture_index] = q->image->gfx_handle; + last_texture = q->image->gfx_handle; + last_texture_index = texture_index; } - block_index += 1; - block = block->next; + if (q->type == QUAD_TYPE_TEXT) { + float pixel_width = 2.0/(float)window.width; + float pixel_height = 2.0/(float)window.height; + + q->bottom_left.x = round(q->bottom_left.x / pixel_width) * pixel_width; + q->bottom_left.y = round(q->bottom_left.y / pixel_height) * pixel_height; + q->top_left.x = round(q->top_left.x / pixel_width) * pixel_width; + q->top_left.y = round(q->top_left.y / pixel_height) * pixel_height; + q->top_right.x = round(q->top_right.x / pixel_width) * pixel_width; + q->top_right.y = 
round(q->top_right.y / pixel_height) * pixel_height; + q->bottom_right.x = round(q->bottom_right.x / pixel_width) * pixel_width; + q->bottom_right.y = round(q->bottom_right.y / pixel_height) * pixel_height; + } + + // We will write to 6 vertices for the one quad (two tris) + { + + D3D11_Vertex* BL = pointer + 0; + D3D11_Vertex* TL = pointer + 1; + D3D11_Vertex* TR = pointer + 2; + D3D11_Vertex* BL2 = pointer + 3; + D3D11_Vertex* TR2 = pointer + 4; + D3D11_Vertex* BR = pointer + 5; + pointer += 6; + + BL->position = v4(q->bottom_left.x, q->bottom_left.y, 0, 1); + TL->position = v4(q->top_left.x, q->top_left.y, 0, 1); + TR->position = v4(q->top_right.x, q->top_right.y, 0, 1); + BR->position = v4(q->bottom_right.x, q->bottom_right.y, 0, 1); + + BL->uv = v2(q->uv.x1, q->uv.y1); + TL->uv = v2(q->uv.x1, q->uv.y2); + TR->uv = v2(q->uv.x2, q->uv.y2); + BR->uv = v2(q->uv.x2, q->uv.y1); + + BL->color = TL->color = TR->color = BR->color = q->color; + + BL->texture_index=TL->texture_index=TR->texture_index=BR->texture_index = texture_index; + BL->type=TL->type=TR->type=BR->type = (u8)q->type; + + u8 sampler = -1; + if (q->image_min_filter == GFX_FILTER_MODE_NEAREST + && q->image_mag_filter == GFX_FILTER_MODE_NEAREST) + sampler = 0; + if (q->image_min_filter == GFX_FILTER_MODE_LINEAR + && q->image_mag_filter == GFX_FILTER_MODE_LINEAR) + sampler = 1; + if (q->image_min_filter == GFX_FILTER_MODE_LINEAR + && q->image_mag_filter == GFX_FILTER_MODE_NEAREST) + sampler = 2; + if (q->image_min_filter == GFX_FILTER_MODE_NEAREST + && q->image_mag_filter == GFX_FILTER_MODE_LINEAR) + sampler = 3; + + BL->type=TL->type=TR->type=BR->type = (u8)q->type; + BL->sampler=TL->sampler=TR->sampler=BR->sampler = (u8)sampler; + + *BL2 = *BL; + *TR2 = *TR; + + number_of_rendered_quads += 1; + } } } @@ -738,7 +746,9 @@ void gfx_update() { d3d11_process_draw_frame(); - VTABLE(Present, d3d11_swap_chain, window.enable_vsync, window.enable_vsync ? 
0 : DXGI_PRESENT_ALLOW_TEARING); + tm_scope_cycles("Present") { + VTABLE(Present, d3d11_swap_chain, window.enable_vsync, window.enable_vsync ? 0 : DXGI_PRESENT_ALLOW_TEARING); + } #if CONFIGURATION == DEBUG diff --git a/oogabooga/gfx_impl_legacy_opengl.c b/oogabooga/gfx_impl_legacy_opengl.c deleted file mode 100644 index ae047b6..0000000 --- a/oogabooga/gfx_impl_legacy_opengl.c +++ /dev/null @@ -1,80 +0,0 @@ - -// #Temporary #Cleanup -// #Temporary #Cleanup -// #Temporary #Cleanup -// #Temporary #Cleanup -#include "GL/gl.h" -HDC hdc; -typedef BOOL (APIENTRY *PFNWGLSWAPINTERVALEXTPROC) (int interval); - -const Gfx_Handle GFX_INVALID_HANDLE = 0; - -void gfx_init() { - // #Temporary #Cleanup - // #Temporary #Cleanup - // #Temporary #Cleanup - // #Temporary #Cleanup - - PIXELFORMATDESCRIPTOR pfd = { - sizeof(PIXELFORMATDESCRIPTOR), - 1, - PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER, - PFD_TYPE_RGBA, - 32, - 0, 0, 0, 0, 0, 0, - 0, 0, - 0, 0, 0, 0, 0, - 24, - 8, - 0, - PFD_MAIN_PLANE, - 0, - 0, 0, 0 - }; - - hdc = GetDC(window._os_handle); - int pixelFormat = ChoosePixelFormat(hdc, &pfd); - SetPixelFormat(hdc, pixelFormat, &pfd); - - HGLRC hglrc = wglCreateContext(hdc); - wglMakeCurrent(hdc, hglrc); - - PFNWGLSWAPINTERVALEXTPROC wglSwapIntervalEXT = (PFNWGLSWAPINTERVALEXTPROC) wglGetProcAddress("wglSwapIntervalEXT"); - - assert(wglSwapIntervalEXT, "Could not load wglSwapIntervalEXT"); - - wglSwapIntervalEXT(0); - - glEnable(GL_BLEND); - glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); -} - -void gfx_update() { - - Draw_Quad_Block *block = &draw_frame.first_block; - glBegin(GL_QUADS); - while (block != 0) { - - for (u64 i = 0; i < block->num_quads; i++) { - Draw_Quad q = block->quad_buffer[i]; - - - glColor4f(v4_expand(q.color)); - glVertex2f(v2_expand(q.bottom_left)); - glVertex2f(v2_expand(q.top_left)); - glVertex2f(v2_expand(q.top_right)); - glVertex2f(v2_expand(q.bottom_right)); - - } - - block = block->next; - } - glEnd(); - - SwapBuffers(hdc); - 
glClearColor(window.clear_color.r, window.clear_color.g, window.clear_color.b, window.clear_color.a); - glClear(GL_COLOR_BUFFER_BIT); - glViewport(0, 0, window.width, window.height); - - draw_frame = ZERO(Draw_Frame); -} \ No newline at end of file diff --git a/oogabooga/memory.c b/oogabooga/memory.c index 4141712..25d7b3c 100644 --- a/oogabooga/memory.c +++ b/oogabooga/memory.c @@ -489,6 +489,9 @@ thread_local void * temporary_storage_pointer = 0; thread_local bool has_warned_temporary_storage_overflow = false; thread_local Allocator temp; +Allocator get_temporary_allocator() { + return temp; +} void* temp_allocator_proc(u64 size, void *p, Allocator_Message message, void* data) { switch (message) { diff --git a/oogabooga/oogabooga.c b/oogabooga/oogabooga.c index 8df859a..9d54a6e 100644 --- a/oogabooga/oogabooga.c +++ b/oogabooga/oogabooga.c @@ -107,7 +107,7 @@ #define OGB_VERSION_MAJOR 0 #define OGB_VERSION_MINOR 0 -#define OGB_VERSION_PATCH 4 +#define OGB_VERSION_PATCH 5 #define OGB_VERSION (OGB_VERSION_MAJOR*1000000+OGB_VERSION_MINOR*1000+OGB_VERSION_PATCH) @@ -266,6 +266,7 @@ typedef u8 bool; #include "path_utils.c" #include "linmath.c" #include "range.c" +#include "utility.c" #include "hash_table.c" @@ -292,8 +293,6 @@ typedef u8 bool; #error "We only have a D3D11 renderer at the moment" #elif GFX_RENDERER == GFX_RENDERER_METAL #error "We only have a D3D11 renderer at the moment" -#elif GFX_RENDERER == GFX_RENDERER_LEGACY_OPENGL - #include "gfx_impl_legacy_opengl.c" #else #error "Unknown renderer GFX_RENDERER defined" #endif diff --git a/oogabooga/os_impl_windows.c b/oogabooga/os_impl_windows.c index c432dbe..bb5a718 100644 --- a/oogabooga/os_impl_windows.c +++ b/oogabooga/os_impl_windows.c @@ -139,7 +139,6 @@ void os_init(u64 program_memory_size) { memset(&window, 0, sizeof(window)); - timeBeginPeriod(1); #if CONFIGURATION == RELEASE SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); #endif @@ -364,7 +363,6 @@ bool os_grow_program_memory(u64 
new_size) { DWORD WINAPI win32_thread_invoker(LPVOID param) { - timeBeginPeriod(1); #if CONFIGURATION == RELEASE SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); #endif @@ -518,13 +516,15 @@ void os_high_precision_sleep(f64 ms) { s32 sleep_time = (s32)((end-start)-1.0); bool do_sleep = sleep_time >= 1; - timeBeginPeriod(1); // I don't see a reason to reset this + timeBeginPeriod(1); if (do_sleep) os_sleep(sleep_time); while (os_get_current_time_in_seconds() < end) { os_yield_thread(); } + + timeEndPeriod(1); } diff --git a/oogabooga/profiling.c b/oogabooga/profiling.c index 086cd7f..7c837e8 100644 --- a/oogabooga/profiling.c +++ b/oogabooga/profiling.c @@ -24,7 +24,7 @@ void _profiler_report_time_cycles(string name, u64 count, u64 start) { spinlock_acquire_or_wait(&_profiler_lock); string fmt = STR("{\"cat\":\"function\",\"dur\":%.3f,\"name\":\"%s\",\"ph\":\"X\",\"pid\":0,\"tid\":%zu,\"ts\":%lld},"); - string_builder_print(&_profile_output, fmt, (float64)count*1000, name, GetCurrentThreadId(), start*1000); + string_builder_print(&_profile_output, fmt, (float64)count*1000, name, context.thread_id, start*1000); spinlock_release(&_profiler_lock); } diff --git a/oogabooga/range.c b/oogabooga/range.c index b20c815..f5983a5 100644 --- a/oogabooga/range.c +++ b/oogabooga/range.c @@ -3,6 +3,15 @@ // I know that we'll have a Range2i at some point, so maybe it's better to be explicit for less confusion? // I'll leave this decision up to u charlie just delete this whenever u see it +// charlie: +// Is this range stuff really necessary? +// Why not just: +// typedef Vector2 Range1f; +// typedef Vector4 Range2f; +// Vector4 also already have alias for x1, y1, x2, y2 and we could add an alias for min & max vectors (see linmath.c) +// This feels like introducing unnecessary complexity and vocabulary when it's really just +// another way to say Vector2 and Vector4. 
+ typedef struct Range1f { float min; float max; diff --git a/oogabooga/tests.c b/oogabooga/tests.c index e97dc3c..3d15322 100644 --- a/oogabooga/tests.c +++ b/oogabooga/tests.c @@ -1141,10 +1141,80 @@ void test_mutex() { mutex_destroy(&data.mutex); } +// Comparator passed to merge_sort() in test_sort(): returns the difference of the two quads' z, so the sign tells which z is lower. +int compare_draw_quads(const void *a, const void *b) { + return ((Draw_Quad*)a)->z-((Draw_Quad*)b)->z; +} +void test_sort() { + // Times radix_sort() and merge_sort() on freshly generated Draw_Quad z values, asserts ascending order after every run and prints the average cycles and milliseconds of each. + int num_samples = 100; + u64 id_bits = 21; + u64 item_count = 5000; + + f64 seconds = 0; + u64 cycles = 0; + // One allocation: the first item_count quads are sorted, the second item_count quads are the help buffer. NOTE(review): no dealloc of items is visible in this function. + Draw_Quad *items = alloc(get_heap_allocator(), (item_count * 2) * sizeof(Draw_Quad)); + Draw_Quad *buffer = items + item_count; + + for (int a = 0; a < num_samples; a++) { + // Even indices get a random z, odd indices get their own index as z. + for (u64 i = 0; i < item_count; i++) { + if (i % 2 == 0) items[i].z = get_random_int_in_range(0, pow(2, id_bits) / 2); + else items[i].z = i; + } + + u64 item_size = sizeof(Draw_Quad); + u64 sort_value_offset_in_item = offsetof(Draw_Quad, z); + + float64 start_seconds = os_get_current_time_in_seconds(); + u64 start_cycles = os_get_current_cycle_count(); + radix_sort(items, buffer, item_count, item_size, sort_value_offset_in_item, id_bits); + u64 end_cycles = os_get_current_cycle_count(); + float64 end_seconds = os_get_current_time_in_seconds(); + + for (u64 i = 1; i < item_count; i++) { + assert(items[i].z >= items[i-1].z, "Failed: not correctly sorted"); + } + + seconds += end_seconds - start_seconds; + cycles += end_cycles - start_cycles; + } + + print("Radix sort took on average %llu cycles and %.2f ms\n", cycles / num_samples, (seconds * 1000.0) / (float64)num_samples); + + seconds = 0; + cycles = 0; + for (int a = 0; a < num_samples; a++) { + + for (u64 i = 0; i < item_count; i++) { + if (i % 2 == 0) items[i].z = get_random_int_in_range(0, pow(2, id_bits) / 2); + else items[i].z = i; + } + + u64 item_size = sizeof(Draw_Quad); + u64 sort_value_offset_in_item = offsetof(Draw_Quad, z); + + float64 start_seconds = os_get_current_time_in_seconds(); + u64 start_cycles = os_get_current_cycle_count(); +
merge_sort(items, buffer, item_count, item_size, compare_draw_quads); + u64 end_cycles = os_get_current_cycle_count(); + float64 end_seconds = os_get_current_time_in_seconds(); + + for (u64 i = 1; i < item_count; i++) { + assert(items[i].z >= items[i-1].z, "Failed: not correctly sorted"); + } + + seconds += end_seconds - start_seconds; + cycles += end_cycles - start_cycles; + } + + print("Merge sort took on average %llu cycles and %.2f ms\n", cycles / num_samples, (seconds * 1000.0) / (float64)num_samples); +} void oogabooga_run_tests() { - print("Testing allocator... "); + /*print("Testing allocator... "); test_allocator(true); print("OK!\n"); @@ -1178,7 +1248,9 @@ void oogabooga_run_tests() { print("Testing mutex... "); test_mutex(); + print("OK!\n");*/ + + print("Testing radix sort... "); + test_sort(); print("OK!\n"); - - } \ No newline at end of file diff --git a/oogabooga/utility.c b/oogabooga/utility.c new file mode 100644 index 0000000..6f4bbae --- /dev/null +++ b/oogabooga/utility.c @@ -0,0 +1,103 @@ + + + +// This is a very niche sort algorithm. +// I use it for Z sorting quads. +// help_buffer should be same size as collection. +// This only works with integers, and it will use the first number_of_bits in the integer +// at sort_value_offset_in_item for sorting. +// There is a cost of memory as we need to double the buffer we're sorting BUT the performance +// gain is very promising. 
+// At 21 bits I'm able to sort a completely randomized collection of 100k integers at around
+// 8m cycles (or 2.5-2.6ms on my shitty laptop i5-11300H)
+// Keys are biased by 2^(number_of_bits-1) so that negative values sort below positive ones.
+// Each key is loaded with memcpy, one byte per 8-bit pass, rather than through a u64 pointer
+// cast, so a narrower key field (e.g. an s32) is not over-read and no alignment is assumed.
+// Only the low key bytes are read, which assumes a little-endian key layout.
+// The sort is stable and ascending, and the result ends up in collection.
+void radix_sort(void *collection, void *help_buffer, u64 item_count, u64 item_size, u64 sort_value_offset_in_item, u64 number_of_bits) {
+    // Nothing to reorder. This also keeps the shift below defined when number_of_bits is 0.
+    if (item_count < 2 || number_of_bits == 0) return;
+    if (number_of_bits > 64) number_of_bits = 64; // A key is loaded into a u64
+    local_persist const int RADIX = 256;
+    local_persist const int BITS_PER_PASS = 8;
+    local_persist const int MASK = (RADIX - 1);
+
+    const u32 PASS_COUNT = (u32)((number_of_bits + BITS_PER_PASS - 1) / BITS_PER_PASS);
+    const u64 SIGN_SHIFT = 1ULL << (number_of_bits - 1);
+    const u64 KEY_BYTES = PASS_COUNT; // One key byte per pass, so at most 8
+
+    u64* count = (u64*)alloc(get_temporary_allocator(), RADIX * sizeof(u64));
+    u8* items = (u8*)collection;
+    u8* buffer = (u8*)help_buffer;
+
+    for (u32 pass = 0; pass < PASS_COUNT; ++pass) {
+        u32 shift = pass * BITS_PER_PASS;
+
+        // Histogram of the current digit
+        for (u32 i = 0; i < RADIX; ++i) count[i] = 0;
+        for (u64 i = 0; i < item_count; ++i) {
+            u64 key = 0;
+            memcpy(&key, items + i * item_size + sort_value_offset_in_item, KEY_BYTES);
+            ++count[((key + SIGN_SHIFT) >> shift) & MASK];
+        }
+
+        // Exclusive prefix sum: count[d] becomes the first output slot for digit d
+        u64 sum = 0;
+        for (u32 i = 0; i < RADIX; ++i) {
+            u64 bucket_size = count[i];
+            count[i] = sum;
+            sum += bucket_size;
+        }
+
+        // Stable scatter into the help buffer, then copy back for the next pass
+        for (u64 i = 0; i < item_count; ++i) {
+            u8 *item = items + i * item_size;
+            u64 key = 0;
+            memcpy(&key, item + sort_value_offset_in_item, KEY_BYTES);
+            u32 digit = ((key + SIGN_SHIFT) >> shift) & MASK;
+            memcpy(buffer + count[digit] * item_size, item, item_size);
+            ++count[digit];
+        }
+        memcpy(items, buffer, item_count * item_size);
+    }
+
+    dealloc(get_temporary_allocator(), count);
+}
+
+// Stable bottom-up merge sort of item_count items of item_size bytes each.
+// compare(a, b) returns <= 0 when a should stay in front of b.
+// help_buffer must have room for item_count items; the result ends up in collection.
+void merge_sort(void *collection, void *help_buffer, u64 item_count, u64 item_size, int (*compare)(const void *, const void *)) {
+    u8 *items = (u8 *)collection;
+    u8 *buffer = (u8 *)help_buffer;
+
+    for (u64 width = 1; width < item_count; width *= 2) {
+        for (u64 i = 0; i < item_count; i += 2 * width) {
+            // Merge the adjacent sorted runs [left, right) and [right, end)
+            u64 left = i;
+            u64 right = (i + width < item_count) ? (i + width) : item_count;
+            u64 end = (i + 2 * width < item_count) ? (i + 2 * width) : item_count;
+            u64 left_index = left;
+            u64 right_index = right;
+            u64 k = left;
+
+            while (left_index < right && right_index < end) {
+                if (compare(items + left_index * item_size, items + right_index * item_size) <= 0) {
+                    memcpy(buffer + k * item_size, items + left_index * item_size, item_size);
+                    left_index++;
+                } else {
+                    memcpy(buffer + k * item_size, items + right_index * item_size, item_size);
+                    right_index++;
+                }
+                k++;
+            }
+
+            // At most one run has items left; each remainder is contiguous, so copy it whole
+            memcpy(buffer + k * item_size, items + left_index * item_size, (right - left_index) * item_size);
+            k += right - left_index;
+            memcpy(buffer + k * item_size, items + right_index * item_size, (end - right_index) * item_size);
+            memcpy(items + left * item_size, buffer + left * item_size, (end - left) * item_size);
+        }
+    }
+}