diff --git a/oogabooga/d3d11_image_shader_bytecode.c b/oogabooga/d3d11_image_shader_bytecode.c index 2964655..9349ffd 100644 --- a/oogabooga/d3d11_image_shader_bytecode.c +++ b/oogabooga/d3d11_image_shader_bytecode.c @@ -1,6 +1,7 @@ /* <<<<<< Bytecode compiled fro HLSL code below: >>>>>> + struct VS_INPUT { float4 position : POSITION; @@ -9,10 +10,6 @@ struct VS_INPUT int texture_index : TEXTURE_INDEX; uint type : TYPE; uint sampler_index : SAMPLER_INDEX; - // s8 texture_index - // u8 type - // u8 sampler_index - // u8 }; struct PS_INPUT diff --git a/oogabooga/dev/d3d11_image_shader.hlsl b/oogabooga/dev/d3d11_image_shader.hlsl index c1ce13c..dc24c16 100644 --- a/oogabooga/dev/d3d11_image_shader.hlsl +++ b/oogabooga/dev/d3d11_image_shader.hlsl @@ -1,3 +1,4 @@ + struct VS_INPUT { float4 position : POSITION; @@ -6,10 +7,6 @@ struct VS_INPUT int texture_index : TEXTURE_INDEX; uint type : TYPE; uint sampler_index : SAMPLER_INDEX; - // s8 texture_index - // u8 type - // u8 sampler_index - // u8 }; struct PS_INPUT diff --git a/oogabooga/examples/renderer_stress_test.c b/oogabooga/examples/renderer_stress_test.c index 091d3b4..3f09f5d 100644 --- a/oogabooga/examples/renderer_stress_test.c +++ b/oogabooga/examples/renderer_stress_test.c @@ -28,7 +28,7 @@ int entry(int argc, char **argv) { render_atlas_if_not_yet_rendered(font, 32, 'A'); - seed_for_random = os_get_current_cycle_count(); + seed_for_random = rdtsc(); const float64 fps_limit = 69000; const float64 min_frametime = 1.0 / fps_limit; @@ -106,7 +106,7 @@ int entry(int argc, char **argv) { draw_image(bush_image, v2(x, y), v2(0.1, 0.1), COLOR_WHITE); pop_z_layer(); } - seed_for_random = os_get_current_cycle_count(); + seed_for_random = rdtsc(); Matrix4 hammer_xform = m4_scalar(1.0); hammer_xform = m4_rotate_z(hammer_xform, (f32)now); diff --git a/oogabooga/font.c b/oogabooga/font.c index 099482e..f2e3b8a 100644 --- a/oogabooga/font.c +++ b/oogabooga/font.c @@ -161,7 +161,7 @@ void font_variation_init(Gfx_Font_Variation *variation, Gfx_Font *font, u32 font // This one is bottom-top as opposed to normally in stbtt where it's top-bottom int x0, y0, x1, y1; stbtt_GetCodepointBitmapBox(&font->stbtt_handle, (int)c, variation->scale, variation->scale, &x0, &y0, &x1, &y1); - float c_ascent = (float)(y1-y0); // #Bugprone #Cleanup I am not at all sure about this! + float c_ascent = (float)(y1-y0); if (c_ascent > variation->metrics.latin_ascent) variation->metrics.latin_ascent = c_ascent; diff --git a/oogabooga/memory.c b/oogabooga/memory.c index e69a4cb..3823a1a 100644 --- a/oogabooga/memory.c +++ b/oogabooga/memory.c @@ -218,7 +218,6 @@ Heap_Block *make_heap_block(Heap_Block *parent, u64 size) { - // #Speed #Cleanup if (((u8*)block)+size >= ((u8*)program_memory)+program_memory_size) { u64 minimum_size = ((u8*)block+size) - (u8*)program_memory + 1; u64 new_program_size = get_next_power_of_two(minimum_size); diff --git a/oogabooga/os_impl_windows.c b/oogabooga/os_impl_windows.c index 8714b4e..0ba1712 100644 --- a/oogabooga/os_impl_windows.c +++ b/oogabooga/os_impl_windows.c @@ -255,22 +255,17 @@ void os_init(u64 program_memory_size) { assert(os.crt != 0, "Could not load win32 crt library. Might be compiled with non-msvc? #Incomplete #Portability"); os.crt_vsnprintf = (Crt_Vsnprintf_Proc)os_dynamic_library_load_symbol(os.crt, STR("vsnprintf")); assert(os.crt_vsnprintf, "Missing vsnprintf in crt"); - os.crt_vprintf = (Crt_Vprintf_Proc)os_dynamic_library_load_symbol(os.crt, STR("vprintf")); - assert(os.crt_vprintf, "Missing vprintf in crt"); - os.crt_vsprintf = (Crt_Vsprintf_Proc)os_dynamic_library_load_symbol(os.crt, STR("vsprintf")); - assert(os.crt_vsprintf, "Missing vsprintf in crt"); - os.crt_memcpy = (Crt_Memcpy_Proc)os_dynamic_library_load_symbol(os.crt, STR("memcpy")); - assert(os.crt_memcpy, "Missing memcpy in crt"); - os.crt_memcmp = (Crt_Memcmp_Proc)os_dynamic_library_load_symbol(os.crt, STR("memcmp")); - assert(os.crt_memcmp, "Missing crt_memcmp in crt"); - os.crt_memset = (Crt_Memset_Proc)os_dynamic_library_load_symbol(os.crt, STR("memset")); - assert(os.crt_memset, "Missing memset in crt"); win32_init_window(); - os_start_thread(os_make_thread(win32_audio_thread, get_heap_allocator())); - os_start_thread(os_make_thread(win32_audio_poll_default_device_thread, get_heap_allocator())); + local_persist Thread audio_thread, audio_poll_default_device_thread; + + os_thread_init(&audio_thread, win32_audio_thread); + os_thread_init(&audio_poll_default_device_thread, win32_audio_poll_default_device_thread); + + os_thread_start(&audio_thread); + os_thread_start(&audio_poll_default_device_thread); while (!win32_has_audio_thread_started) { os_yield_thread(); } } @@ -404,6 +399,8 @@ DWORD WINAPI win32_thread_invoker(LPVOID param) { return 0; } + +////// DEPRECATED vvvvvvvvvvvvvvvvv Thread* os_make_thread(Thread_Proc proc, Allocator allocator) { Thread *t = (Thread*)alloc(allocator, sizeof(Thread)); t->id = 0; // This is set when we start it @@ -433,6 +430,33 @@ void os_start_thread(Thread *t) { void os_join_thread(Thread *t) { WaitForSingleObject(t->os_handle, INFINITE); } +////// DEPRECATED ^^^^^^^^^^^^^^^^ + +void os_thread_init(Thread *t, Thread_Proc proc) { + memset(t, 0, sizeof(Thread)); + t->id = 0; + t->proc = proc; + t->initial_context = context; +} +void os_thread_destroy(Thread *t) { + os_thread_join(t); + CloseHandle(t->os_handle); +} +void os_thread_start(Thread *t) { + t->os_handle = CreateThread( + 0, + 0, + win32_thread_invoker, + t, + 0, + (DWORD*)&t->id + ); + + assert(t->os_handle, "Failed creating thread"); +} +void os_thread_join(Thread *t) { + WaitForSingleObject(t->os_handle, INFINITE); +} /// // Mutex primitive @@ -474,59 +498,6 @@ void os_unlock_mutex(Mutex_Handle m) { assert(result, "Unlock mutex 0x%x failed with error %d", m, GetLastError()); } -/// -// Spinlock "primitive" - -Spinlock *os_make_spinlock(Allocator allocator) { - // #Memory #Cleanup do we need to heap allocate this ? - Spinlock *l = cast(Spinlock*)alloc(allocator, sizeof(Spinlock)); - l->locked = false; - return l; -} -void os_spinlock_lock(Spinlock *l) { - while (true) { - bool expected = false; - if (compare_and_swap_bool(&l->locked, true, expected)) { - return; - } - while (l->locked) { - // spinny boi - } - } -} - -void os_spinlock_unlock(Spinlock *l) { - bool expected = true; - bool success = compare_and_swap_bool(&l->locked, false, expected); - assert(success, "This thread should have acquired the spinlock but compare_and_swap failed"); -} - - -/// -// Concurrency utilities - -bool os_compare_and_swap_8(u8 *a, u8 b, u8 old) { - // #Portability not sure how portable this is. - return _InterlockedCompareExchange8((volatile CHAR*)a, (CHAR)b, (CHAR)old) == (CHAR)old; -} - -bool os_compare_and_swap_16(u16 *a, u16 b, u16 old) { - return InterlockedCompareExchange16((volatile SHORT*)a, (SHORT)b, (SHORT)old) == (SHORT)old; -} - -bool os_compare_and_swap_32(u32 *a, u32 b, u32 old) { - return InterlockedCompareExchange((volatile LONG*)a, (LONG)b, (LONG)old) == (LONG)old; -} - -bool os_compare_and_swap_64(u64 *a, u64 b, u64 old) { - return InterlockedCompareExchange64((volatile LONG64*)a, (LONG64)b, (LONG64)old) == (LONG64)old; -} - -bool os_compare_and_swap_bool(bool *a, bool b, bool old) { - return os_compare_and_swap_8(cast(u8*)a, cast(u8)b, cast(u8)old); -} - - void os_sleep(u32 ms) { Sleep(ms); diff --git a/oogabooga/os_interface.c b/oogabooga/os_interface.c index ad992a3..b54d0b8 100644 --- a/oogabooga/os_interface.c +++ b/oogabooga/os_interface.c @@ -32,13 +32,7 @@ #define _INTSIZEOF(n) ((sizeof(n) + sizeof(int) - 1) & ~(sizeof(int) - 1)) -// #Cleanup we only need vsnprintf -typedef void* (__cdecl *Crt_Memcpy_Proc) (void*, const void*, size_t); -typedef int (__cdecl *Crt_Memcmp_Proc) (const void*, const void*, size_t); -typedef void* (__cdecl *Crt_Memset_Proc) (void*, int, size_t); -typedef int (__cdecl *Crt_Vprintf_Proc) (const char*, va_list); typedef int (__cdecl *Crt_Vsnprintf_Proc) (char*, size_t, const char*, va_list); -typedef int (__cdecl *Crt_Vsprintf_Proc) (char*, const char*, va_list); typedef struct Os_Info { u64 page_size; @@ -46,37 +40,19 @@ typedef struct Os_Info { Dynamic_Library_Handle crt; - // #Cleanup we only need vsnprintf - Crt_Memcpy_Proc crt_memcpy; - Crt_Memcmp_Proc crt_memcmp; - Crt_Memset_Proc crt_memset; - Crt_Vprintf_Proc crt_vprintf; Crt_Vsnprintf_Proc crt_vsnprintf; - Crt_Vsprintf_Proc crt_vsprintf; void *static_memory_start, *static_memory_end; } Os_Info; Os_Info os; -inline int crt_vprintf(const char* fmt, va_list args) { - return os.crt_vprintf(fmt, args); -} - inline bool bytes_match(void *a, void *b, u64 count) { return memcmp(a, b, count) == 0; } inline int vsnprintf(char* buffer, size_t n, const char* fmt, va_list args) { return os.crt_vsnprintf(buffer, n, fmt, args); } -inline int crt_sprintf(char *str, const char *format, ...) { - va_list args; - va_start(args, format); - int r = os.crt_vsprintf(str, format, args); - va_end(args); - return r; -} - Mutex_Handle program_memory_mutex = 0; bool os_grow_program_memory(size_t new_size); @@ -91,22 +67,25 @@ typedef struct Thread Thread; typedef void(*Thread_Proc)(Thread*); typedef struct Thread { - u64 id; + u64 id; // This is valid after os_thread_start Context initial_context; void* data; Thread_Proc proc; Thread_Handle os_handle; - Allocator allocator; + Allocator allocator; // Deprecated !! #Cleanup } Thread; /// // Thread primitive -// #Cleanup this shouldn't be allocating just for the pointer!! Just do os_thread_init(*) -Thread* os_make_thread(Thread_Proc proc, Allocator allocator); -void os_destroy_thread(Thread *t); -void os_start_thread(Thread* t); -void os_join_thread(Thread* t); +DEPRECATED(Thread* os_make_thread(Thread_Proc proc, Allocator allocator), "Use os_thread_init instead"); +DEPRECATED(void os_destroy_thread(Thread *t), "Use os_thread_destroy instead"); +DEPRECATED(void os_start_thread(Thread* t), "Use os_thread_start instead"); +DEPRECATED(void os_join_thread(Thread* t), "Use os_thread_join instead"); +void os_thread_init(Thread *t, Thread_Proc proc); +void os_thread_destroy(Thread *t); +void os_thread_start(Thread *t); +void os_thread_join(Thread *t); /// @@ -116,27 +95,8 @@ void os_destroy_mutex(Mutex_Handle m); void os_lock_mutex(Mutex_Handle m); void os_unlock_mutex(Mutex_Handle m); -typedef struct Spinlock Spinlock; -// #Cleanup Moved to threading.c -DEPRECATED(Spinlock *os_make_spinlock(Allocator allocator), "use spinlock_init instead"); -DEPRECATED(void os_spinlock_lock(Spinlock* l), "use spinlock_acquire_or_wait instead"); -DEPRECATED(void os_spinlock_unlock(Spinlock* l), "use spinlock_release instead"); - /// -// Concurrency utilities - -// #Cleanup -// In retrospect, I'm not sure why I choose to implement this per OS. -// I think Win32 InterlockedCompareExchange just generates the cmpxchg -// instruction anyways, so may as well just inline asm it (or Win32 -// if we're compiling with msvc) (LDREX/STREX on ARM) -// - CharlieM July 8th 2024 -// compare_and_swap in cpu.c -DEPRECATED(bool os_compare_and_swap_8 (u8 *a, u8 b, u8 old), "use compare_and_swap instead"); -DEPRECATED(bool os_compare_and_swap_16 (u16 *a, u16 b, u16 old), "use compare_and_swap instead"); -DEPRECATED(bool os_compare_and_swap_32 (u32 *a, u32 b, u32 old), "use compare_and_swap instead"); -DEPRECATED(bool os_compare_and_swap_64 (u64 *a, u64 b, u64 old), "use compare_and_swap instead"); -DEPRECATED(bool os_compare_and_swap_bool(bool *a, bool b, bool old), "use compare_and_swap instead"); +// Threading utilities void os_sleep(u32 ms); void os_yield_thread(); @@ -147,8 +107,7 @@ void os_high_precision_sleep(f64 ms); // Time /// -// #Cleanup getting the cycle count is an x86 intrinsic so this should be in cpu.c -u64 os_get_current_cycle_count(); +DEPRECATED(u64 os_get_current_cycle_count(), "use rdtsc() instead"); float64 os_get_current_time_in_seconds(); diff --git a/oogabooga/tests.c b/oogabooga/tests.c index b546d47..18989c5 100644 --- a/oogabooga/tests.c +++ b/oogabooga/tests.c @@ -212,11 +212,12 @@ void test_thread_proc1(Thread* t) { void test_threads() { - Thread* t = os_make_thread(test_thread_proc1, get_heap_allocator()); - os_start_thread(t); + Thread t; + os_thread_init(&t, test_thread_proc1); + os_thread_start(&t); os_sleep(20); print("This should be printed in middle of thread execution\n"); - os_join_thread(t); + os_thread_join(&t); print("Thread is joined\n"); Mutex_Handle m = os_make_mutex(); @@ -767,65 +768,65 @@ void test_simd() { memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float)); memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float)); - u64 start = os_get_current_cycle_count(); + u64 start = rdtsc(); for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 16) { simd_mul_float32_512_aligned(&samples_a[i], &samples_b[i], &samples_a[i]); } - u64 end = os_get_current_cycle_count(); + u64 end = rdtsc(); u64 cycles = end-start; print("simd 512 float32 mul took %llu cycles\n", cycles); memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float)); memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float)); - start = os_get_current_cycle_count(); + start = rdtsc(); for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 8) { simd_mul_float32_256_aligned(&samples_a[i], &samples_b[i], &samples_a[i]); } - end = os_get_current_cycle_count(); + end = rdtsc(); cycles = end-start; print("simd 256 float32 mul took %llu cycles\n", cycles); memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float)); memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float)); - start = os_get_current_cycle_count(); + start = rdtsc(); for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 4) { simd_mul_float32_128_aligned(&samples_a[i], &samples_b[i], &samples_a[i]); } - end = os_get_current_cycle_count(); + end = rdtsc(); cycles = end-start; print("simd 128 float32 mul took %llu cycles\n", cycles); memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float)); memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float)); - start = os_get_current_cycle_count(); + start = rdtsc(); for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 2) { simd_mul_float32_64(&samples_a[i], &samples_b[i], &samples_a[i]); } - end = os_get_current_cycle_count(); + end = rdtsc(); cycles = end-start; print("simd 64 float32 mul took %llu cycles\n", cycles); memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float)); memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float)); - start = os_get_current_cycle_count(); + start = rdtsc(); for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 1) { samples_a[i] = samples_a[i] + samples_b[i]; } - end = os_get_current_cycle_count(); + end = rdtsc(); cycles = end-start; print("NO SIMD float32 mul took %llu cycles\n", cycles); } @@ -1061,7 +1062,7 @@ void test_hash_table() { void test_random_distribution() { int bins[NUM_BINS] = {0}; - seed_for_random = os_get_current_cycle_count(); + seed_for_random = rdtsc(); for (int i = 0; i < NUM_SAMPLES; i++) { f32 rand_val = get_random_float32(); int bin = (int)(rand_val * NUM_BINS); @@ -1124,16 +1125,16 @@ void test_mutex() { const int num_threads = 100; - Thread **threads = alloc(allocator, sizeof(Thread*)*num_threads); + Thread *threads = alloc(allocator, sizeof(Thread)*num_threads); for (u64 i = 0; i < num_threads; i++) { - threads[i] = os_make_thread(mutex_test_increment_counter, allocator); - threads[i]->data = &data; + os_thread_init(&threads[i], mutex_test_increment_counter); + threads[i].data = &data; } for (u64 i = 0; i < num_threads; i++) { - os_start_thread(threads[i]); + os_thread_start(&threads[i]); } for (u64 i = 0; i < num_threads; i++) { - os_join_thread(threads[i]); + os_thread_join(&threads[i]); } assert(data.counter == num_threads * MUTEX_TEST_TASK_COUNT, "Failed: Counter does not match expected value after threading tasks"); @@ -1167,9 +1168,9 @@ void test_sort() { u64 sort_value_offset_in_item = offsetof(Draw_Quad, z); float64 start_seconds = os_get_current_time_in_seconds(); - u64 start_cycles = os_get_current_cycle_count(); + u64 start_cycles = rdtsc(); radix_sort(items, buffer, item_count, item_size, sort_value_offset_in_item, id_bits); - u64 end_cycles = os_get_current_cycle_count(); + u64 end_cycles = rdtsc(); float64 end_seconds = os_get_current_time_in_seconds(); for (u64 i = 1; i < item_count; i++) { @@ -1195,9 +1196,9 @@ void test_sort() { u64 sort_value_offset_in_item = offsetof(Draw_Quad, z); float64 start_seconds = os_get_current_time_in_seconds(); - u64 start_cycles = os_get_current_cycle_count(); + u64 start_cycles = rdtsc(); merge_sort(items, buffer, item_count, item_size, compare_draw_quads); - u64 end_cycles = os_get_current_cycle_count(); + u64 end_cycles = rdtsc(); float64 end_seconds = os_get_current_time_in_seconds(); for (u64 i = 1; i < item_count; i++) {