- Z Layers

- Sorting
- Contiguous quad buffer
Charlie 2024-07-10 17:10:38 +02:00
parent a9cbf7ee68
commit c92b6fd4b7
15 changed files with 386 additions and 254 deletions

View file

@ -3,6 +3,11 @@
///
// Build config stuff
// #Temporary
#define RUN_TESTS 0
#define OOGABOOGA_DEV 1
#define ENABLE_PROFILING 1
#define INITIAL_PROGRAM_MEMORY_SIZE MB(5)
typedef struct Context_Extra {
@ -26,11 +31,11 @@ typedef struct Context_Extra {
//
// this is a minimal starting point for new projects. Copy & rename to get started
#include "oogabooga/examples/minimal_game_loop.c"
// #include "oogabooga/examples/minimal_game_loop.c"
// #include "oogabooga/examples/text_rendering.c"
// #include "oogabooga/examples/custom_logger.c"
// #include "oogabooga/examples/renderer_stress_test.c"
#include "oogabooga/examples/renderer_stress_test.c"
// This is where you swap in your own project!
// #include "entry_yourepicgamename.c"

View file

@ -1,3 +1,17 @@
## v0.00.005 - Z layers
Renderer:
- Added optional Z-sorting
- Either set quad->z or call push_z_layer(s32) (and pop_z_layer())
- Enable with draw_frame.enable_z_sorting = true (see the usage sketch after this list)
- Refactored the quad buffering to just be a growing quad buffer rather than a linked list of quad blocks. Your CPU will be thankful.
Misc:
- removed gfx_impl_legacy_opengl.c
- Sorting procedures
- merge_sort()
- radix_sort()
- sorting tests
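
A minimal usage sketch of the new z-layer API inside a frame, based on the notes above and the renderer_stress_test changes in this commit. Colors and z values are illustrative, and it is an assumption here that draw_rect forwards the Draw_Quad* that draw_quad() returns:

```c
// Opt in to z-sorting for this frame (it is off by default).
draw_frame.enable_z_sorting = true;

// Everything drawn between push and pop gets z = 10.
push_z_layer(10);
draw_rect(v2(-0.5f, -0.5f), v2(1, 1), COLOR_GREEN);
pop_z_layer();

// Or set z directly on the quad a draw call returns
// (assumes draw_rect forwards draw_quad()'s Draw_Quad* return value).
Draw_Quad *q = draw_rect(v2(0, 0), v2(0.5f, 0.5f), COLOR_RED);
q->z = 11; // sorted on top of the green rect
```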
## v0.00.004 - Custom logging, more concurrency & bugfixing
Concurrency:

View file

@ -78,6 +78,7 @@ typedef struct Allocator {
} Allocator;
Allocator get_heap_allocator();
Allocator get_temporary_allocator();
typedef struct Context {
void *logger; // void(*Logger_Proc)(Log_Level level, string fmt, ...)

View file

@ -62,13 +62,11 @@ Usage:
*/
// We use radix sort, so the exact bit count is important
#define MAX_Z_BITS 21
#define MAX_Z ((1 << MAX_Z_BITS)/2)
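// For reference: with MAX_Z_BITS = 21, MAX_Z = (1 << 21)/2 = 1048576, so quad z values
// are expected to stay within roughly [-MAX_Z+1, MAX_Z] (the D3D11 backend asserts this range).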
#define Z_STACK_MAX 4096
#define QUADS_PER_BLOCK 256
typedef struct Draw_Quad {
Vector2 bottom_left, top_left, top_right, bottom_right;
// r, g, b, a
@ -82,33 +80,22 @@ typedef struct Draw_Quad {
Gfx_Filter_Mode image_min_filter;
Gfx_Filter_Mode image_mag_filter;
float32 z;
s32 z;
} Draw_Quad;
typedef struct Draw_Quad_Block {
Draw_Quad quad_buffer[QUADS_PER_BLOCK];
u64 num_quads;
float32 low_z, high_z;
struct Draw_Quad_Block *next;
} Draw_Quad_Block;
// I made these blocks part of the frame at first so they were temp allocated BUT I think
// that was a mistake because these blocks are accessed a lot so we want it to just be
// persistent memory that's super hot all the time.
Draw_Quad_Block first_block = {0};
Draw_Quad *quad_buffer;
u64 allocated_quads;
typedef struct Draw_Frame {
Draw_Quad_Block *current;
u64 num_blocks;
u64 num_quads;
Matrix4 projection;
Matrix4 view;
bool enable_z_sorting;
s32 z_stack[Z_STACK_MAX];
u64 z_count;
} Draw_Frame;
// This frame is passed to the platform layer and rendered in os_update.
// Resets every frame.
@ -117,14 +104,21 @@ Draw_Frame draw_frame = ZERO(Draw_Frame);
void reset_draw_frame(Draw_Frame *frame) {
*frame = (Draw_Frame){0};
frame->current = 0;
float32 aspect = (float32)window.width/(float32)window.height;
frame->projection = m4_make_orthographic_projection(-aspect, aspect, -1, 1, -1, 10);
frame->view = m4_scalar(1.0);
}
frame->num_blocks = 0;
void push_z_layer(s32 z) {
assert(draw_frame.z_count < Z_STACK_MAX, "Too many z layers pushed. You can pop with pop_z_layer() when you are done drawing to it.");
draw_frame.z_stack[draw_frame.z_count] = z;
draw_frame.z_count += 1;
}
void pop_z_layer() {
assert(draw_frame.z_count > 0, "No Z layers to pop!");
draw_frame.z_count -= 1;
}
Draw_Quad *draw_quad_projected(Draw_Quad quad, Matrix4 world_to_clip) {
@ -136,36 +130,29 @@ Draw_Quad *draw_quad_projected(Draw_Quad quad, Matrix4 world_to_clip) {
quad.image_min_filter = GFX_FILTER_MODE_NEAREST;
quad.image_mag_filter = GFX_FILTER_MODE_NEAREST;
if (!draw_frame.current) {
draw_frame.current = &first_block;
draw_frame.current->low_z = F32_MAX;
draw_frame.current->high_z = F32_MIN;
draw_frame.current->num_quads = 0;
draw_frame.num_blocks = 1;
quad.z = 0;
if (draw_frame.z_count > 0) quad.z = draw_frame.z_stack[draw_frame.z_count-1];
if (draw_frame.num_quads >= allocated_quads) {
// #Memory
u64 new_count = max(get_next_power_of_two(draw_frame.num_quads+1), 128);
Draw_Quad *new_buffer = alloc(get_heap_allocator(), new_count*sizeof(Draw_Quad));
if (quad_buffer) {
memcpy(new_buffer, quad_buffer, draw_frame.num_quads*sizeof(Draw_Quad));
dealloc(get_heap_allocator(), quad_buffer);
}
assert(draw_frame.current->num_quads <= QUADS_PER_BLOCK);
if (draw_frame.current->num_quads == QUADS_PER_BLOCK) {
if (!draw_frame.current->next) {
draw_frame.current->next = cast(Draw_Quad_Block*)alloc(get_heap_allocator(), sizeof(Draw_Quad_Block));
*draw_frame.current->next = ZERO(Draw_Quad_Block);
quad_buffer = new_buffer;
allocated_quads = new_count;
}
draw_frame.current = draw_frame.current->next;
draw_frame.current->num_quads = 0;
draw_frame.current->low_z = F32_MAX;
draw_frame.current->high_z = F32_MIN;
quad_buffer[draw_frame.num_quads] = quad;
draw_frame.num_quads += 1;
draw_frame.num_blocks += 1;
}
draw_frame.current->quad_buffer[draw_frame.current->num_quads] = quad;
draw_frame.current->num_quads += 1;
return &draw_frame.current->quad_buffer[draw_frame.current->num_quads-1];
return &quad_buffer[draw_frame.num_quads-1];
}
Draw_Quad *draw_quad(Draw_Quad quad) {
return draw_quad_projected(quad, m4_mul(draw_frame.projection, m4_inverse(draw_frame.view)));

View file

@ -19,6 +19,8 @@ int entry(int argc, char **argv) {
rect_xform = m4_translate(rect_xform, v3(-.25f, -.25f, 0));
draw_rect_xform(rect_xform, v2(.5f, .5f), COLOR_GREEN);
draw_rect(v2(sin(now), -.8), v2(.5, .25), COLOR_RED);
gfx_update();
}

View file

@ -87,8 +87,12 @@ int entry(int argc, char **argv) {
camera_view = m4_translate(camera_view, v3(v2_expand(cam_move), 0));
draw_frame.view = camera_view;
local_persist bool do_enable_z_sorting = false;
draw_frame.enable_z_sorting = do_enable_z_sorting;
if (is_key_just_pressed('Z')) do_enable_z_sorting = !do_enable_z_sorting;
seed_for_random = 69;
for (u64 i = 0; i < 100000; i++) {
for (u64 i = 0; i < 50000; i++) {
float32 aspect = (float32)window.width/(float32)window.height;
float min_x = -aspect;
float max_x = aspect;
@ -98,15 +102,18 @@ int entry(int argc, char **argv) {
float x = get_random_float32() * (max_x-min_x) + min_x;
float y = get_random_float32() * (max_y-min_y) + min_y;
push_z_layer((s32)(y*100));
draw_image(bush_image, v2(x, y), v2(0.1, 0.1), COLOR_WHITE);
pop_z_layer();
}
seed_for_random = os_get_current_cycle_count();
Matrix4 hammer_xform = m4_scalar(1.0);
hammer_xform = m4_rotate_z(hammer_xform, (f32)now);
hammer_xform = m4_translate(hammer_xform, v3(-.25f, -.25f, 0));
push_z_layer(1000001);
draw_image_xform(hammer_image, hammer_xform, v2(.5f, .5f), COLOR_RED);
pop_z_layer();
Vector2 hover_position = v2_rotate_point_around_pivot(v2(-.5, -.5), v2(0, 0), (f32)now);
Vector2 local_pivot = v2(.125f, .125f);

View file

@ -15,7 +15,6 @@ string temp_win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16);
typedef struct alignat(16) D3D11_Vertex {
Vector4 color;
Vector4 position;
Vector2 uv;
@ -59,6 +58,9 @@ ID3D11Buffer *d3d11_quad_vbo = 0;
u32 d3d11_quad_vbo_size = 0;
void *d3d11_staging_quad_buffer = 0;
Draw_Quad *sort_quad_buffer = 0;
u64 sort_quad_buffer_size = 0;
const char* d3d11_stringify_category(D3D11_MESSAGE_CATEGORY category) {
switch (category) {
case D3D11_MESSAGE_CATEGORY_APPLICATION_DEFINED: return "Application Defined";
@ -545,7 +547,7 @@ void d3d11_process_draw_frame() {
///
// Maybe grow quad vbo
u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6;
u32 required_size = sizeof(D3D11_Vertex) * allocated_quads*6;
if (required_size > d3d11_quad_vbo_size) {
if (d3d11_quad_vbo) {
@ -567,7 +569,7 @@ void d3d11_process_draw_frame() {
log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size);
}
if (draw_frame.num_blocks > 0) {
if (draw_frame.num_quads > 0) {
///
// Render geometry from the draw frame into the vbo quad list
@ -580,14 +582,24 @@ void d3d11_process_draw_frame() {
D3D11_Vertex* head = (D3D11_Vertex*)d3d11_staging_quad_buffer;
D3D11_Vertex* pointer = head;
u64 number_of_rendered_quads = 0;
Draw_Quad_Block *block = &first_block;
tm_scope_cycles("Quad processing") {
u64 block_index = 0;
while (block != 0 && block->num_quads > 0 && block_index < draw_frame.num_blocks) tm_scope_cycles("Quad block") {
for (u64 i = 0; i < block->num_quads; i++) {
if (draw_frame.enable_z_sorting) tm_scope_cycles("Z sorting") {
if (!sort_quad_buffer || (sort_quad_buffer_size < allocated_quads*sizeof(Draw_Quad))) {
// #Memory #Heapalloc
if (sort_quad_buffer) dealloc(get_heap_allocator(), sort_quad_buffer);
sort_quad_buffer = alloc(get_heap_allocator(), allocated_quads*sizeof(Draw_Quad));
sort_quad_buffer_size = allocated_quads*sizeof(Draw_Quad);
}
radix_sort(quad_buffer, sort_quad_buffer, draw_frame.num_quads, sizeof(Draw_Quad), offsetof(Draw_Quad, z), MAX_Z_BITS);
}
Draw_Quad *q = &block->quad_buffer[i];
for (u64 i = 0; i < draw_frame.num_quads; i++) {
Draw_Quad *q = &quad_buffer[i];
assert(q->z <= MAX_Z, "Z is too high. Z is %d, Max is %d.", q->z, MAX_Z);
assert(q->z >= (-MAX_Z+1), "Z is too low. Z is %d, Min is %d.", q->z, -MAX_Z+1);
s8 texture_index = -1;
@ -691,10 +703,6 @@ void d3d11_process_draw_frame() {
number_of_rendered_quads += 1;
}
}
block_index += 1;
block = block->next;
}
}
tm_scope_cycles("Write to gpu") {
@ -738,7 +746,9 @@ void gfx_update() {
d3d11_process_draw_frame();
tm_scope_cycles("Present") {
VTABLE(Present, d3d11_swap_chain, window.enable_vsync, window.enable_vsync ? 0 : DXGI_PRESENT_ALLOW_TEARING);
}
#if CONFIGURATION == DEBUG

View file

@ -1,80 +0,0 @@
// #Temporary #Cleanup
// #Temporary #Cleanup
// #Temporary #Cleanup
// #Temporary #Cleanup
#include "GL/gl.h"
HDC hdc;
typedef BOOL (APIENTRY *PFNWGLSWAPINTERVALEXTPROC) (int interval);
const Gfx_Handle GFX_INVALID_HANDLE = 0;
void gfx_init() {
// #Temporary #Cleanup
// #Temporary #Cleanup
// #Temporary #Cleanup
// #Temporary #Cleanup
PIXELFORMATDESCRIPTOR pfd = {
sizeof(PIXELFORMATDESCRIPTOR),
1,
PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER,
PFD_TYPE_RGBA,
32,
0, 0, 0, 0, 0, 0,
0, 0,
0, 0, 0, 0, 0,
24,
8,
0,
PFD_MAIN_PLANE,
0,
0, 0, 0
};
hdc = GetDC(window._os_handle);
int pixelFormat = ChoosePixelFormat(hdc, &pfd);
SetPixelFormat(hdc, pixelFormat, &pfd);
HGLRC hglrc = wglCreateContext(hdc);
wglMakeCurrent(hdc, hglrc);
PFNWGLSWAPINTERVALEXTPROC wglSwapIntervalEXT = (PFNWGLSWAPINTERVALEXTPROC) wglGetProcAddress("wglSwapIntervalEXT");
assert(wglSwapIntervalEXT, "Could not load wglSwapIntervalEXT");
wglSwapIntervalEXT(0);
glEnable(GL_BLEND);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
}
void gfx_update() {
Draw_Quad_Block *block = &draw_frame.first_block;
glBegin(GL_QUADS);
while (block != 0) {
for (u64 i = 0; i < block->num_quads; i++) {
Draw_Quad q = block->quad_buffer[i];
glColor4f(v4_expand(q.color));
glVertex2f(v2_expand(q.bottom_left));
glVertex2f(v2_expand(q.top_left));
glVertex2f(v2_expand(q.top_right));
glVertex2f(v2_expand(q.bottom_right));
}
block = block->next;
}
glEnd();
SwapBuffers(hdc);
glClearColor(window.clear_color.r, window.clear_color.g, window.clear_color.b, window.clear_color.a);
glClear(GL_COLOR_BUFFER_BIT);
glViewport(0, 0, window.width, window.height);
draw_frame = ZERO(Draw_Frame);
}

View file

@ -489,6 +489,9 @@ thread_local void * temporary_storage_pointer = 0;
thread_local bool has_warned_temporary_storage_overflow = false;
thread_local Allocator temp;
Allocator get_temporary_allocator() {
return temp;
}
void* temp_allocator_proc(u64 size, void *p, Allocator_Message message, void* data) {
switch (message) {

View file

@ -107,7 +107,7 @@
#define OGB_VERSION_MAJOR 0
#define OGB_VERSION_MINOR 0
#define OGB_VERSION_PATCH 4
#define OGB_VERSION_PATCH 5
#define OGB_VERSION (OGB_VERSION_MAJOR*1000000+OGB_VERSION_MINOR*1000+OGB_VERSION_PATCH)
@ -266,6 +266,7 @@ typedef u8 bool;
#include "path_utils.c"
#include "linmath.c"
#include "range.c"
#include "utility.c"
#include "hash_table.c"
@ -292,8 +293,6 @@ typedef u8 bool;
#error "We only have a D3D11 renderer at the moment"
#elif GFX_RENDERER == GFX_RENDERER_METAL
#error "We only have a D3D11 renderer at the moment"
#elif GFX_RENDERER == GFX_RENDERER_LEGACY_OPENGL
#include "gfx_impl_legacy_opengl.c"
#else
#error "Unknown renderer GFX_RENDERER defined"
#endif

View file

@ -139,7 +139,6 @@ void os_init(u64 program_memory_size) {
memset(&window, 0, sizeof(window));
timeBeginPeriod(1);
#if CONFIGURATION == RELEASE
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
#endif
@ -364,7 +363,6 @@ bool os_grow_program_memory(u64 new_size) {
DWORD WINAPI win32_thread_invoker(LPVOID param) {
timeBeginPeriod(1);
#if CONFIGURATION == RELEASE
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
#endif
@ -518,13 +516,15 @@ void os_high_precision_sleep(f64 ms) {
s32 sleep_time = (s32)((end-start)-1.0);
bool do_sleep = sleep_time >= 1;
timeBeginPeriod(1); // I don't see a reason to reset this
timeBeginPeriod(1);
if (do_sleep) os_sleep(sleep_time);
while (os_get_current_time_in_seconds() < end) {
os_yield_thread();
}
timeEndPeriod(1);
}

View file

@ -24,7 +24,7 @@ void _profiler_report_time_cycles(string name, u64 count, u64 start) {
spinlock_acquire_or_wait(&_profiler_lock);
string fmt = STR("{\"cat\":\"function\",\"dur\":%.3f,\"name\":\"%s\",\"ph\":\"X\",\"pid\":0,\"tid\":%zu,\"ts\":%lld},");
string_builder_print(&_profile_output, fmt, (float64)count*1000, name, GetCurrentThreadId(), start*1000);
string_builder_print(&_profile_output, fmt, (float64)count*1000, name, context.thread_id, start*1000);
spinlock_release(&_profiler_lock);
}

View file

@ -3,6 +3,15 @@
// I know that we'll have a Range2i at some point, so maybe it's better to be explicit for less confusion?
// I'll leave this decision up to u charlie just delete this whenever u see it
// charlie:
// Is this range stuff really necessary?
// Why not just:
// typedef Vector2 Range1f;
// typedef Vector4 Range2f;
// Vector4 also already has aliases for x1, y1, x2, y2 and we could add an alias for min & max vectors (see linmath.c)
// This feels like introducing unnecessary complexity and vocabulary when it's really just
// another way to say Vector2 and Vector4.
typedef struct Range1f {
float min;
float max;

View file

@ -1141,10 +1141,80 @@ void test_mutex() {
mutex_destroy(&data.mutex);
}
int compare_draw_quads(const void *a, const void *b) {
return ((Draw_Quad*)a)->z-((Draw_Quad*)b)->z;
}
void test_sort() {
int num_samples = 100;
u64 id_bits = 21;
u64 item_count = 5000;
f64 seconds = 0;
u64 cycles = 0;
Draw_Quad *items = alloc(get_heap_allocator(), (item_count * 2) * sizeof(Draw_Quad));
Draw_Quad *buffer = items + item_count;
for (int a = 0; a < num_samples; a++) {
for (u64 i = 0; i < item_count; i++) {
if (i % 2 == 0) items[i].z = get_random_int_in_range(0, pow(2, id_bits) / 2);
else items[i].z = i;
}
u64 item_size = sizeof(Draw_Quad);
u64 sort_value_offset_in_item = offsetof(Draw_Quad, z);
float64 start_seconds = os_get_current_time_in_seconds();
u64 start_cycles = os_get_current_cycle_count();
radix_sort(items, buffer, item_count, item_size, sort_value_offset_in_item, id_bits);
u64 end_cycles = os_get_current_cycle_count();
float64 end_seconds = os_get_current_time_in_seconds();
for (u64 i = 1; i < item_count; i++) {
assert(items[i].z >= items[i-1].z, "Failed: not correctly sorted");
}
seconds += end_seconds - start_seconds;
cycles += end_cycles - start_cycles;
}
print("Radix sort took on average %llu cycles and %.2f ms\n", cycles / num_samples, (seconds * 1000.0) / (float64)num_samples);
seconds = 0;
cycles = 0;
for (int a = 0; a < num_samples; a++) {
for (u64 i = 0; i < item_count; i++) {
if (i % 2 == 0) items[i].z = get_random_int_in_range(0, pow(2, id_bits) / 2);
else items[i].z = i;
}
u64 item_size = sizeof(Draw_Quad);
u64 sort_value_offset_in_item = offsetof(Draw_Quad, z);
float64 start_seconds = os_get_current_time_in_seconds();
u64 start_cycles = os_get_current_cycle_count();
merge_sort(items, buffer, item_count, item_size, compare_draw_quads);
u64 end_cycles = os_get_current_cycle_count();
float64 end_seconds = os_get_current_time_in_seconds();
for (u64 i = 1; i < item_count; i++) {
assert(items[i].z >= items[i-1].z, "Failed: not correctly sorted");
}
seconds += end_seconds - start_seconds;
cycles += end_cycles - start_cycles;
}
print("Merge sort took on average %llu cycles and %.2f ms\n", cycles / num_samples, (seconds * 1000.0) / (float64)num_samples);
}
void oogabooga_run_tests() {
print("Testing allocator... ");
/*print("Testing allocator... ");
test_allocator(true);
print("OK!\n");
@ -1178,7 +1248,9 @@ void oogabooga_run_tests() {
print("Testing mutex... ");
test_mutex();
print("OK!\n");*/
print("Testing radix sort... ");
test_sort();
print("OK!\n");
}

oogabooga/utility.c (new file, 103 additions)
View file

@ -0,0 +1,103 @@
// This is a very niche sort algorithm.
// I use it for Z sorting quads.
// help_buffer should be the same size as the collection.
// This only works with integers, and it will use the first number_of_bits in the integer
// at sort_value_offset_in_item for sorting.
// There is a cost of memory as we need to double the buffer we're sorting BUT the performance
// gain is very promising.
// At 21 bits I'm able to sort a completely randomized collection of 100k integers at around
// 8m cycles (or 2.5-2.6ms on my shitty laptop i5-11300H)
void radix_sort(void *collection, void *help_buffer, u64 item_count, u64 item_size, u64 sort_value_offset_in_item, u64 number_of_bits) {
local_persist const int RADIX = 256;
local_persist const int BITS_PER_PASS = 8;
local_persist const int MASK = (RADIX - 1);
const int PASS_COUNT = ((number_of_bits + BITS_PER_PASS - 1) / BITS_PER_PASS);
const u64 SIGN_SHIFT = 1ULL << (number_of_bits - 1);
u64* count = (u64*)alloc(get_temporary_allocator(), RADIX * sizeof(u64));
u8* items = (u8*)collection;
u8* buffer = (u8*)help_buffer;
for (u32 pass = 0; pass < PASS_COUNT; ++pass) {
u32 shift = pass * BITS_PER_PASS;
for (u32 i = 0; i < RADIX; ++i) {
count[i] = 0;
}
for (u64 i = 0; i < item_count; ++i) {
u64 sort_value = *(u64*)(items + i * item_size + sort_value_offset_in_item);
sort_value += SIGN_SHIFT; // Transform the value to handle negative numbers
u32 digit = (sort_value >> shift) & MASK;
++count[digit];
}
u64 sum = 0;
for (u32 i = 0; i < RADIX; ++i) {
u64 temp = count[i];
count[i] = sum;
sum += temp;
}
for (u64 i = 0; i < item_count; ++i) {
u64 sort_value = *(u64*)(items + i * item_size + sort_value_offset_in_item);
u64 transformed_value = sort_value + SIGN_SHIFT; // Transform the value to handle negative numbers
u32 digit = (transformed_value >> shift) & MASK;
memcpy(buffer + count[digit] * item_size, items + i * item_size, item_size);
++count[digit];
}
memcpy(items, buffer, item_count * item_size);
}
dealloc(get_temporary_allocator(), count);
}
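// A minimal usage sketch of radix_sort, assuming an illustrative caller-side Example_Entry
// struct (not part of this file); it mirrors how test_sort drives the sorter. Note that
// radix_sort reads a u64 at sort_value_offset_in_item, so at least 8 bytes must be readable there.
typedef struct Example_Entry { s32 key; u64 payload; } Example_Entry;
void example_sort_entries(Example_Entry *entries, u64 count) {
	// The help buffer must be at least as large as the collection being sorted.
	Example_Entry *scratch = alloc(get_heap_allocator(), count * sizeof(Example_Entry));
	// Sort ascending by the low 21 bits of 'key'; negative keys in that range are
	// handled by the SIGN_SHIFT transform inside radix_sort.
	radix_sort(entries, scratch, count, sizeof(Example_Entry), offsetof(Example_Entry, key), 21);
	dealloc(get_heap_allocator(), scratch);
}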
void merge_sort(void *collection, void *help_buffer, u64 item_count, u64 item_size, int (*compare)(const void *, const void *)) {
u8 *items = (u8 *)collection;
u8 *buffer = (u8 *)help_buffer;
for (u64 width = 1; width < item_count; width *= 2) {
for (u64 i = 0; i < item_count; i += 2 * width) {
u64 left = i;
u64 right = (i + width < item_count) ? (i + width) : item_count;
u64 end = (i + 2 * width < item_count) ? (i + 2 * width) : item_count;
u64 left_index = left;
u64 right_index = right;
u64 k = left;
while (left_index < right && right_index < end) {
if (compare(items + left_index * item_size, items + right_index * item_size) <= 0) {
memcpy(buffer + k * item_size, items + left_index * item_size, item_size);
left_index++;
} else {
memcpy(buffer + k * item_size, items + right_index * item_size, item_size);
right_index++;
}
k++;
}
while (left_index < right) {
memcpy(buffer + k * item_size, items + left_index * item_size, item_size);
left_index++;
k++;
}
while (right_index < end) {
memcpy(buffer + k * item_size, items + right_index * item_size, item_size);
right_index++;
k++;
}
for (u64 j = left; j < end; j++) {
memcpy(items + j * item_size, buffer + j * item_size, item_size);
}
}
}
}