- Replace lodepng with stb_image (& add stb_truetype for fonts)

- Fix bug where the d3d11 input assembler (input layout) was not being created correctly
- Fix framerate being locked by swap chain present
- Move enable_vsync to window
- Add SIMD sqrt & rsqrt
- Add release build & run configurations to the VS Code tasks & launch files
- Cleanup
This commit is contained in:
Charlie 2024-07-04 20:56:27 +02:00
parent 4c5f882999
commit 05919248eb
26 changed files with 13557 additions and 305 deletions

.gitignore vendored
View file

@ -54,4 +54,6 @@ test_doc.vkn

.vscode/launch.json vendored
View file

@ -2,7 +2,7 @@
"version": "0.2.0",
"configurations": [
"name": "Launch with MSVC Debugger",
"name": "Launch Debug with MSVC Debugger",
"type": "cppvsdbg",
"request": "launch",
"program": "${workspaceFolder}/build/cgame.exe", // Run the output executable after compile
@ -11,7 +11,19 @@
"cwd": "${workspaceFolder}",
"environment": [],
// "preLaunchTask": "Compile"
"preLaunchTask": "Compile"
"name": "Launch Release with MSVC Debugger",
"type": "cppvsdbg",
"request": "launch",
"program": "${workspaceFolder}/build/release/cgame.exe", // Run the output executable after compile
"args": [],
"stopAtEntry": false,
"cwd": "${workspaceFolder}",
"environment": [],
"preLaunchTask": "Compile Release"

.vscode/tasks.json vendored
View file

@ -16,6 +16,21 @@
// "close": false,
// "showReuseMessage": true,
"label": "Compile Release",
"type": "shell",
"command": "${workspaceFolder}\\build_release",
"group": {
"kind": "build"
"problemMatcher": ["$gcc"],
"presentation": {
"clear": true,
// "revealProblems": "onProblem",
// "close": false,
// "showReuseMessage": true,

View file

@ -6,6 +6,6 @@ mkdir build
pushd build
clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -msse4.1
clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations -lkernel32 -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi

View file

@ -3,7 +3,7 @@
// Build config stuff
#define RUN_TESTS 1
#define RUN_TESTS 0
// This is only for people developing oogabooga!
@ -13,6 +13,7 @@
// ENABLE_SIMD Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which doesn't
#define ENABLE_SIMD 1
typedef struct Context_Extra {
@ -21,8 +22,6 @@ typedef struct Context_Extra {
// This needs to be defined before oogabooga if we want extra stuff in context
#define CONTEXT_EXTRA Context_Extra
// This defaults to "entry", but we can set it to anything (except "main" or other existing proc names)
#define ENTRY_PROC entry
@ -38,13 +37,13 @@ typedef struct Context_Extra {
// this is a minimal starting point for new projects. Copy & rename to get started
#include "oogabooga/examples/minimal_game_loop.c"
// #include "oogabooga/examples/minimal_game_loop.c"
// An engine dev stress test for rendering
// #include "oogabooga/examples/renderer_stress_test.c"
// Randy's example game that he's building out as a tutorial for using the engine
// #include "entry_randygame.c"
#include "entry_randygame.c"
// This is where you swap in your own project!
// #include "entry_yourepicgamename.c"

View file

@ -1,14 +1,18 @@
@echo off
rmdir /S /Q build
mkdir build
if exist build/dissassembly (
rmdir /s /q build
if not exist build (
mkdir build
pushd build
mkdir release
pushd release
mkdir dissassembly
pushd dissassembly
clang -o cgame.asm ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -ffast-math -funroll-loops -finline-functions -fvectorize -fslp-vectorize -fomit-frame-pointer -fno-exceptions -fno-rtti -S -masm=intel
clang -o cgame.asm ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -S -masm=intel

View file

@ -1,5 +1,7 @@
@echo off
rmdir /S /Q build
if exist build (
rmdir /s /q build
mkdir build
pushd build
@ -7,7 +9,7 @@ pushd build
mkdir release
pushd release
clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -msse4.1
clang -o cgame.exe ../../build.c -Ofast -DNDEBUG -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions

View file

@ -10,37 +10,22 @@
#define local_persist static
#define forward_global extern
// Haters gonna hate
#define If if (
#define then )
// If cond then {}
#ifdef _MSC_VER
inline void os_break() {
volatile int *a = 0;
*a = 5;
#error "Only msvc compiler supported at the moment";
void printf(const char* fmt, ...);
#define ASSERT_STR_HELPER(x) #x
#define assert_line(line, cond, ...) if(!(cond)) { printf("Assertion failed in file " __FILE__ " on line " ASSERT_STR(line) "\nFailed Condition: " #cond ". Message: " __VA_ARGS__); os_break(); }
#define assert(cond, ...) assert_line(__LINE__, cond, __VA_ARGS__);
#define assert_line(line, cond, ...) {if(!(cond)) { printf("Assertion failed in file " __FILE__ " on line " ASSERT_STR(line) "\nFailed Condition: " #cond ". Message: " __VA_ARGS__); crash(); }}
#define assert(cond, ...) {assert_line(__LINE__, cond, __VA_ARGS__)}
#define DEFER(start, end) for(int _i_ = ((start), 0); _i_ == 0; _i_ += 1, (end))
#undef assert
#define assert(...)
#define assert(...) (void)0;
#define panic(...) { print(__VA_ARGS__); os_break(); }
#define panic(...) { print(__VA_ARGS__); crash(); }
#define cast(t) (t)
@ -48,7 +33,6 @@ void printf(const char* fmt, ...);
#define FIRST_ARG(arg1, ...) arg1
#define SECOND_ARG(arg1, arg2, ...) arg2
#define print(...) _Generic((FIRST_ARG(__VA_ARGS__)), \

View file

@ -29,6 +29,11 @@ typedef struct Cpu_Capabilities {
#define inline __forceinline
#define alignat(x) __declspec(align(x))
inline void crash() {
volatile int *a = 0;
*a = 5;
#include <intrin.h>
#pragma intrinsic(__rdtsc)
inline u64 rdtsc() {
@ -66,6 +71,11 @@ typedef struct Cpu_Capabilities {
#define inline __attribute__((always_inline)) inline
#define alignat(x) __attribute__((aligned(x)))
inline void crash() {
volatile int *a = 0;
*a = 5;
inline u64 rdtsc() {
unsigned int lo, hi;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
@ -119,7 +129,6 @@ typedef struct Cpu_Capabilities {
#warning "Compiler is not explicitly supported, some things will probably not work as expected"
Cpu_Capabilities query_cpu_capabilities() {
Cpu_Capabilities result = {0};

View file

@ -3,10 +3,10 @@
struct VS_INPUT
float4 position : POSITION;
float2 uv : TEXCOORD;
float4 color : COLOR;
int texture_index: TEXTURE_INDEX;
float4 position : POSITION;
struct PS_INPUT
@ -79,8 +79,8 @@ float4 ps_main(PS_INPUT input) : SV_TARGET
0x44, 0x58, 0x42, 0x43, 0xdd, 0x02, 0x55, 0xb0, 0x7b, 0x83, 0x6c, 0x34, 0x45, 0xe8, 0x51, 0xd4,
0x76, 0xbf, 0x66, 0x77, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x03, 0x00, 0x00, 0x05, 0x00, 0x00,
0x44, 0x58, 0x42, 0x43, 0xf4, 0xea, 0x50, 0x9f, 0xcf, 0xeb, 0x01, 0x7b, 0x78, 0x58, 0xd5, 0x6b,
0x4f, 0x9f, 0xc1, 0xe2, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x03, 0x00, 0x00, 0x05, 0x00, 0x00,
0x00, 0x34, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x38, 0x01, 0x00, 0x00, 0xd4, 0x01,
0x00, 0x00, 0xa0, 0x02, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
@ -92,14 +92,14 @@ const u8 IMAGE_SHADER_VERTEX_BLOB_BYTES[]= {
0x6c, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2e, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4e, 0x90, 0x00,
0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x03, 0x03, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00, 0x00, 0x77, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02,
0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00,
0x00, 0x54, 0x45, 0x58, 0x43, 0x4f, 0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52,
0x00, 0x54, 0x45, 0x58, 0x54, 0x55, 0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00,
0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x00, 0xab, 0xab, 0x4f, 0x53, 0x47, 0x4e,
0x0f, 0x0f, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x7a, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02,
0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
0x00, 0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x00, 0x54, 0x45, 0x58, 0x43, 0x4f,
0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52, 0x00, 0x54, 0x45, 0x58, 0x54, 0x55,
0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00, 0xab, 0xab, 0x4f, 0x53, 0x47, 0x4e,
0x94, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@ -111,19 +111,19 @@ const u8 IMAGE_SHADER_VERTEX_BLOB_BYTES[]= {
0x54, 0x45, 0x58, 0x43, 0x4f, 0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52, 0x00,
0x54, 0x45, 0x58, 0x54, 0x55, 0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00, 0xab,
0xab, 0xab, 0x53, 0x48, 0x45, 0x58, 0xc4, 0x00, 0x00, 0x00, 0x50, 0x00, 0x01, 0x00, 0x31,
0x00, 0x00, 0x00, 0x6a, 0x08, 0x00, 0x01, 0x5f, 0x00, 0x00, 0x03, 0x32, 0x10, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00,
0x00, 0x5f, 0x00, 0x00, 0x03, 0x12, 0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x5f, 0x00,
0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x04, 0xf2,
0x00, 0x00, 0x00, 0x6a, 0x08, 0x00, 0x01, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x03, 0x32, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00,
0x00, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x5f, 0x00,
0x00, 0x03, 0x12, 0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x04, 0xf2,
0x20, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03,
0x32, 0x20, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03, 0xf2, 0x20, 0x10,
0x00, 0x02, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03, 0x12, 0x20, 0x10, 0x00, 0x03, 0x00,
0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0xf2, 0x20, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46,
0x1e, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x32, 0x20, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00,
0x05, 0xf2, 0x20, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x1e, 0x10, 0x00, 0x01, 0x00,
0x1e, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x32, 0x20, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00,
0x05, 0xf2, 0x20, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x1e, 0x10, 0x00, 0x02, 0x00,
0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x12, 0x20, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0a,
0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x94, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,

View file

@ -1,9 +1,9 @@
struct VS_INPUT
float4 position : POSITION;
float2 uv : TEXCOORD;
float4 color : COLOR;
int texture_index: TEXTURE_INDEX;
float4 position : POSITION;
struct PS_INPUT

View file

@ -197,65 +197,39 @@ Draw_Quad *draw_image_xform(Gfx_Image *image, Matrix4 xform, Vector2 size, Vecto
#define COLOR_BLACK ((Vector4){0.0, 0.0, 0.0, 1.0})
Gfx_Image *load_image_from_disk(string path, Allocator allocator) {
string png;
bool ok = os_read_entire_file(path, &png, allocator);
if (!ok) return 0;
string png;
bool ok = os_read_entire_file(path, &png, allocator);
if (!ok) return 0;
Gfx_Image *image = alloc(allocator, sizeof(Gfx_Image));
// This is fucking terrible I gotta write my own decoder
lodepng_allocator = allocator;
LodePNGState state;
u32 error = lodepng_inspect(&image->width, &image->height, &state, png.data, png.count);
if (error) {
return 0;
// 5 lines of code to say "ignore_adler32 = true" (because it's broken and gives me an error)
LodePNGDecoderSettings decoder;
decoder.zlibsettings.ignore_adler32 = true;
state.decoder = decoder;
error = lodepng_decode(&image->data, &image->width, &image->height, &state, png.data, png.count);
dealloc_string(allocator, png);
if (error) {
return 0;
// We need to flip the image
u32 row_bytes = image->width * 4; // #Magicvalue assuming 4 bytes
u8* temp_row = (u8*)alloc(temp, row_bytes);
for (u32 i = 0; i < image->height / 2; i++) {
u8* top_row = image->data + i * row_bytes;
u8* bottom_row = image->data + (image->height - i - 1) * row_bytes;
// Swap the top row with the bottom row
memcpy(temp_row, top_row, row_bytes);
memcpy(top_row, bottom_row, row_bytes);
memcpy(bottom_row, temp_row, row_bytes);
Gfx_Image *image = alloc(allocator, sizeof(Gfx_Image));
// Use stb_image to load and decode the PNG
int width, height, channels;
stbi_set_flip_vertically_on_load(1); // stb_image can flip the image on load
unsigned char* stb_data = stbi_load_from_memory(png.data, png.count, &width, &height, &channels, STBI_rgb_alpha);
if (!stb_data) {
dealloc(allocator, image);
dealloc_string(allocator, png);
return 0;
image->gfx_handle = GFX_INVALID_HANDLE; // This is handled in gfx
image->allocator = allocator;
return image;
image->data = stb_data;
image->width = width;
image->height = height;
image->gfx_handle = GFX_INVALID_HANDLE; // This is handled in gfx
image->allocator = allocator;
dealloc_string(allocator, png);
return image;
void delete_image(Gfx_Image *image) {
dealloc(image->allocator, image->data);
image->width = 0;
image->height = 0;
draw_frame.garbage_stack[draw_frame.garbage_stack_count] = image->gfx_handle;
draw_frame.garbage_stack_count += 1;
dealloc(image->allocator, image);
stbi_image_free(image->data); // Free the image data allocated by stb_image
image->width = 0;
image->height = 0;
draw_frame.garbage_stack[draw_frame.garbage_stack_count] = image->gfx_handle;
draw_frame.garbage_stack_count += 1;
dealloc(image->allocator, image);

View file

@ -15,8 +15,6 @@ int entry(int argc, char **argv) {
Gfx_Image *hammer_image = load_image_from_disk(STR("oogabooga/examples/hammer.png"), get_heap_allocator());
assert(hammer_image, "Failed loading hammer.png");
Gfx_Font *font = load_font_From_disk(
seed_for_random = os_get_current_cycle_count();
const float64 fps_limit = 69000;
@ -36,7 +34,9 @@ int entry(int argc, char **argv) {
delta = now - last_time;
last_time = now;
tm_scope_cycles("os_update") {
if (is_key_just_released(KEY_ESCAPE)) {
window.should_close = true;
@ -102,11 +102,10 @@ int entry(int argc, char **argv) {
draw_image(bush_image, v2(0.65, 0.65), v2(0.2*sin(now), 0.2*sin(now)), COLOR_WHITE);
draw_frame.font = STR("");
tm_scope_cycles("gfx_update") {
if (is_key_just_released('E')) {
log("FPS: %.2f", 1.0 / delta);

View file

@ -13,10 +13,10 @@ const Gfx_Handle GFX_INVALID_HANDLE = 0;
string temp_win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16);
typedef struct D3D11_Vertex {
typedef struct alignat(16) D3D11_Vertex {
Vector4 color;
Vector4 position;
Vector2 uv;
Vector4 color;
int texture_index;
} D3D11_Vertex;
@ -81,14 +81,19 @@ void CALLBACK d3d11_debug_callback(D3D11_MESSAGE_CATEGORY category, D3D11_MESSAG
@ -127,7 +132,8 @@ void d3d11_update_swapchain() {
if (create) {
scd.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
//scd.BufferDesc.RefreshRate.Numerator = xx st.refresh_rate;
//scd.BufferDesc.RefreshRate.Numerator = 0;
//scd.BufferDesc.RefreshRate.Denominator = 1;
@ -137,23 +143,23 @@ void d3d11_update_swapchain() {
scd.Scaling = DXGI_SCALING_STRETCH; // for compatibility with Windows 7
// Windows 10 allows to use DXGI_SWAP_EFFECT_FLIP_DISCARD
// for Windows 8 compatibility use DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL
// for Windows 7 compatibility use DXGI_SWAP_EFFECT_DISCARD
if (d3d11_feature_level >= D3D_FEATURE_LEVEL_11_0) {
// this is supported only on FLIP presentation model
scd.Scaling = DXGI_SCALING_NONE;
scd.BufferCount = 3;
gfx._can_vsync = false;
log_verbose("Present mode is flip discard, 3 buffers");
} else {
scd.BufferCount = 2;
gfx._can_vsync = true;
log_verbose("Present mode is discard, 2 buffers");
// Obtain DXGI factory from device
IDXGIDevice *dxgi_device;
hr = VTABLE(QueryInterface, d3d11_device, &IID_IDXGIDevice, cast(void**)&dxgi_device);
@ -224,7 +230,7 @@ void d3d11_update_swapchain() {
void gfx_init() {
gfx.enable_vsync = false;
window.enable_vsync = false;
log_verbose("d3d11 gfx_init");
@ -426,42 +432,53 @@ void gfx_init() {
log_verbose("Shaders created");
memset(layout, 0, sizeof(layout));
layout[0].SemanticName = "POSITION";
layout[0].SemanticIndex = 0;
layout[0].Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
layout[0].InputSlot = 0;
layout[0].AlignedByteOffset = offsetof(D3D11_Vertex, position);
layout[0].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
layout[0].InstanceDataStepRate = 0;
layout[1].SemanticName = "TEXCOORD";
layout[1].SemanticIndex = 0;
layout[1].Format = DXGI_FORMAT_R32G32_FLOAT;
layout[1].InputSlot = 0;
layout[1].AlignedByteOffset = offsetof(D3D11_Vertex, uv);
layout[1].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
layout[1].InstanceDataStepRate = 0;
layout[2].SemanticName = "COLOR";
layout[2].SemanticIndex = 0;
layout[2].Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
layout[2].InputSlot = 0;
layout[2].AlignedByteOffset = offsetof(D3D11_Vertex, color);
layout[2].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
layout[2].InstanceDataStepRate = 0;
layout[3].SemanticName = "TEXTURE_INDEX";
layout[3].SemanticIndex = 0;
layout[3].Format = DXGI_FORMAT_R32_SINT;
layout[3].InputSlot = 0;
layout[3].AlignedByteOffset = offsetof(D3D11_Vertex, texture_index);
layout[3].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
layout[3].InstanceDataStepRate = 0;
hr = VTABLE(CreateInputLayout, d3d11_device, layout, 4, vs_buffer, vs_size, &d3d11_image_vertex_layout);
memset(layout, 0, sizeof(layout));
layout[0] = (D3D11_INPUT_ELEMENT_DESC){
offsetof(D3D11_Vertex, position),
layout[1] = (D3D11_INPUT_ELEMENT_DESC){
offsetof(D3D11_Vertex, uv),
layout[2] = (D3D11_INPUT_ELEMENT_DESC){
"COLOR", 0,
offsetof(D3D11_Vertex, color),
layout[3] = (D3D11_INPUT_ELEMENT_DESC){
offsetof(D3D11_Vertex, texture_index),
hr = VTABLE(CreateInputLayout, d3d11_device, layout, 4, vs_buffer, vs_size, &d3d11_image_vertex_layout);
log_info("D3D11 init done");
void d3d11_draw_call(int number_of_rendered_quads, ID3D11ShaderResourceView **textures, u64 num_textures) {
@ -493,7 +510,6 @@ void d3d11_draw_call(int number_of_rendered_quads, ID3D11ShaderResourceView **te
void gfx_update() {
if (window.should_close) return;
VTABLE(ClearRenderTargetView, d3d11_context, d3d11_window_render_target_view, (float*)&window.clear_color);
@ -501,59 +517,61 @@ void gfx_update() {
// purge garbage
for (u64 i = 0; i < draw_frame.garbage_stack_count; i++) {
ID3D11ShaderResourceView *view = draw_frame.garbage_stack[i];
ID3D11Resource *resource = 0;
VTABLE(GetResource, view, &resource);
ID3D11Texture2D *texture = 0;
hr = VTABLE(QueryInterface, resource, &IID_ID3D11Texture2D, (void**)&texture);
if (SUCCEEDED(hr)) {
log("Destroyed an image");
} else {
panic("Unhandled D3D11 resource deletion");
tm_scope_cycles("Frame setup") {
// purge garbage
for (u64 i = 0; i < draw_frame.garbage_stack_count; i++) {
ID3D11ShaderResourceView *view = draw_frame.garbage_stack[i];
ID3D11Resource *resource = 0;
VTABLE(GetResource, view, &resource);
ID3D11Texture2D *texture = 0;
hr = VTABLE(QueryInterface, resource, &IID_ID3D11Texture2D, (void**)&texture);
if (SUCCEEDED(hr)) {
log("Destroyed an image");
} else {
panic("Unhandled D3D11 resource deletion");
// Maybe resize swap chain
RECT client_rect;
bool ok = GetClientRect(window._os_handle, &client_rect);
assert(ok, "GetClientRect failed with error code %lu", GetLastError());
u32 window_width = client_rect.right-client_rect.left;
u32 window_height = client_rect.bottom-client_rect.top;
if (window_width != d3d11_swap_chain_width || window_height != d3d11_swap_chain_height) {
// Maybe grow quad vbo
u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6;
if (required_size > d3d11_quad_vbo_size) {
if (d3d11_quad_vbo) {
dealloc(get_heap_allocator(), d3d11_staging_quad_buffer);
desc.Usage = D3D11_USAGE_DYNAMIC;
desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
desc.ByteWidth = required_size;
desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
HRESULT hr = VTABLE(CreateBuffer, d3d11_device, &desc, 0, &d3d11_quad_vbo);
assert(SUCCEEDED(hr), "CreateBuffer failed");
d3d11_quad_vbo_size = required_size;
d3d11_staging_quad_buffer = alloc(get_heap_allocator(), d3d11_quad_vbo_size);
assert((u64)d3d11_staging_quad_buffer%16 == 0);
log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size);
// Maybe resize swap chain
RECT client_rect;
bool ok = GetClientRect(window._os_handle, &client_rect);
assert(ok, "GetClientRect failed with error code %lu", GetLastError());
u32 window_width = client_rect.right-client_rect.left;
u32 window_height = client_rect.bottom-client_rect.top;
if (window_width != d3d11_swap_chain_width || window_height != d3d11_swap_chain_height) {
// Maybe grow quad vbo
u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6;
if (required_size > d3d11_quad_vbo_size) {
if (d3d11_quad_vbo) {
dealloc(get_heap_allocator(), d3d11_staging_quad_buffer);
desc.Usage = D3D11_USAGE_DYNAMIC;
desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
desc.ByteWidth = required_size;
desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
HRESULT hr = VTABLE(CreateBuffer, d3d11_device, &desc, 0, &d3d11_quad_vbo);
assert(SUCCEEDED(hr), "CreateBuffer failed");
d3d11_quad_vbo_size = required_size;
d3d11_staging_quad_buffer = alloc(get_heap_allocator(), d3d11_quad_vbo_size);
log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size);
f64 rest_before = os_get_current_time_in_seconds();
if (draw_frame.num_blocks > 0) {
// Render geometry from into vbo quad list
@ -569,8 +587,8 @@ void gfx_update() {
Draw_Quad_Block *block = &first_block;
tm_scope_cycles("Quad processing") {
while (block != 0 && block->num_quads > 0) tm_scope_cycles("ad2As") {
for (u64 i = 0; i < block->num_quads; i++) tm_scope_cycles("Single quad") {
while (block != 0 && block->num_quads > 0) tm_scope_cycles("Quad block") {
for (u64 i = 0; i < block->num_quads; i++) {
Draw_Quad *q = &block->quad_buffer[i];
@ -620,7 +638,7 @@ void gfx_update() {
if (num_textures >= 32) {
// If max textures reached, make a draw call and start over
D3D11_MAPPED_SUBRESOURCE buffer_mapping;
VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_NO_OVERWRITE, 0, &buffer_mapping);
VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping);
memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
@ -676,30 +694,29 @@ void gfx_update() {
D3D11_MAPPED_SUBRESOURCE buffer_mapping;
VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_NO_OVERWRITE, 0, &buffer_mapping);
memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
tm_scope_cycles("Write to gpu") {
D3D11_MAPPED_SUBRESOURCE buffer_mapping;
tm_scope_cycles("The Map call") {
hr = VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping);
tm_scope_cycles("The memcpy") {
memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
tm_scope_cycles("The Unmap call") {
VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
// Draw call
u64 before_draw = os_get_current_cycle_count();
d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
u64 after_draw = os_get_current_cycle_count();
//log("Draw call took %llu cycles", after_draw-before_draw);
tm_scope_cycles("Draw call") d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
f64 rest_after = os_get_current_time_in_seconds();
if (is_key_just_pressed('E'))
log("The rest took %.2fms", (rest_after-rest_before)*1000.0);
f64 before_present = os_get_current_time_in_seconds();
hr = VTABLE(Present, d3d11_swap_chain, gfx._can_vsync && gfx.enable_vsync, 0);
f64 after = os_get_current_time_in_seconds();
if (is_key_just_pressed('E'))
log("Present took %.2fms", (after-before_present)*1000.0);
tm_scope_cycles("Present") {
hr = VTABLE(Present, d3d11_swap_chain, window.enable_vsync, window.enable_vsync ? 0 : DXGI_PRESENT_ALLOW_TEARING);

View file

@ -15,18 +15,6 @@
#error "Unknown renderer GFX_RENDERER defined"
typedef struct Gfx_State {
// config
bool enable_vsync;
// readonly
bool _can_vsync;
} Gfx_State;
Gfx_State gfx;
forward_global const Gfx_Handle GFX_INVALID_HANDLE;
typedef struct Gfx_Image {

View file

@ -133,7 +133,7 @@ inline float v3_dot_product(Vector3 a, Vector3 b) {
return simd_dot_product_float32_96((float*)&a, (float*)&b);
inline float v4_dot_product(Vector4 a, Vector4 b) {
return simd_dot_product_float32_128((float*)&a, (float*)&b);
return simd_dot_product_float32_128_aligned((float*)&a, (float*)&b);
Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) {

View file

@ -24,7 +24,7 @@ void* initialization_allocator_proc(u64 size, void *p, Allocator_Message message
if (init_memory_head >= ((u8*)init_memory_arena+INIT_MEMORY_SIZE)) {
os_write_string_to_stdout(STR("Out of initialization memory! Please provide more by increasing INIT_MEMORY_SIZE"));
return p;

View file

@ -145,23 +145,22 @@ typedef u8 bool;
#warning "Compiler is not explicitly supported, some things will probably not work as expected"
#include "cpu.c"
#define DEBUG 0
#define VERY_DEBUG 1
#define RELEASE 2
#if !defined(CONFIGURATION)
#if defined(NDEBUG)
#if defined(NDEBUG)
#include "cpu.c"
#ifndef ENTRY_PROC
#define ENTRY_PROC entry

View file

@ -135,6 +135,8 @@ LRESULT CALLBACK win32_window_proc(HWND passed_window, UINT message, WPARAM wpar
void os_init(u64 program_memory_size) {
memset(&window, 0, sizeof(window));
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);

View file

@ -61,36 +61,6 @@ inline int crt_vprintf(const char* fmt, va_list args) {
return os.crt_vprintf(fmt, args);
inline void* naive_memcpy(void* dest, const void* source, size_t size) {
for (u64 i = 0; i < (u64)size; i++) ((u8*)dest)[i] = ((u8*)source)[i];
return dest;
inline void* memcpy(void* dest, const void* source, size_t size) {
if (!os.crt_memcpy) return naive_memcpy(dest, source, size);
return os.crt_memcpy(dest, source, size);
inline int naive_memcmp(const void* a, const void* b, size_t amount) {
// I don't understand the return value of memcmp but I also dont care
for (u64 i = 0; i < (u64)amount; i++) {
if (((u8*)a)[i] != ((u8*)b)[i]) return -1;
return 0;
inline int memcmp(const void* a, const void* b, size_t amount) {
if (!os.crt_memcmp) return naive_memcmp(a, b, amount);
return os.crt_memcmp(a, b, amount);
inline void* naive_memset(void* dest, int value, size_t amount) {
for (u64 i = 0; i < (u64)amount; i++) ((u8*)dest)[i] = (u8)value;
return dest;
inline void* memset(void* dest, int value, size_t amount) {
if (!os.crt_memset) return naive_memset(dest, value, amount);
return os.crt_memset(dest, value, amount);
inline bool bytes_match(void *a, void *b, u64 count) { return memcmp(a, b, count) == 0; }
inline int vsnprintf(char* buffer, size_t n, const char* fmt, va_list args) {
@ -333,6 +303,7 @@ typedef struct Os_Window {
u32 x;
u32 y;
Vector4 clear_color;
bool enable_vsync;
bool should_close;

View file

@ -30,6 +30,16 @@ inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result);
inline float basic_dot_product_float32_64(float *a, float *b);
inline float basic_dot_product_float32_96(float *a, float *b);
inline float basic_dot_product_float32_128(float *a, float *b);
inline void basic_sqrt_float32_64(float *a, float *result);
inline void basic_sqrt_float32_96(float *a, float *result);
inline void basic_sqrt_float32_128(float *a, float *result);
inline void basic_sqrt_float32_256(float *a, float *result);
inline void basic_sqrt_float32_512(float *a, float *result);
inline void basic_rsqrt_float32_64(float *a, float *result);
inline void basic_rsqrt_float32_96(float *a, float *result);
inline void basic_rsqrt_float32_128(float *a, float *result);
inline void basic_rsqrt_float32_256(float *a, float *result);
inline void basic_rsqrt_float32_512(float *a, float *result);
@ -123,6 +133,52 @@ inline void simd_div_float32_128_aligned(float *a, float *b, float* result) {
__m128 vr = _mm_div_ps(va, vb);
_mm_store_ps(result, vr);
inline void simd_sqrt_float32_96(float *a, float *result) {
__m128 va = _mm_loadu_ps(a);
va = _mm_and_ps(va, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); // Mask last element
__m128 vr = _mm_sqrt_ps(va);
_mm_storeu_ps(result, vr);
inline void simd_rsqrt_float32_96(float *a, float *result) {
__m128 va = _mm_loadu_ps(a);
va = _mm_and_ps(va, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); // Mask last element
__m128 vr = _mm_rsqrt_ps(va);
_mm_storeu_ps(result, vr);
inline void simd_sqrt_float32_64(float *a, float *result) {
__m128 va = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)a);
__m128 vr = _mm_sqrt_ps(va);
_mm_storel_pi((__m64*)result, vr);
inline void simd_rsqrt_float32_64(float *a, float *result) {
__m128 va = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)a);
__m128 vr = _mm_rsqrt_ps(va);
_mm_storel_pi((__m64*)result, vr);
inline void simd_sqrt_float32_128(float *a, float *result) {
__m128 va = _mm_loadu_ps(a);
__m128 vr = _mm_sqrt_ps(va);
_mm_storeu_ps(result, vr);
inline void simd_rsqrt_float32_128(float *a, float *result) {
__m128 va = _mm_loadu_ps(a);
__m128 vr = _mm_rsqrt_ps(va);
_mm_storeu_ps(result, vr);
inline void simd_sqrt_float32_128_aligned(float *a, float *result) {
__m128 va = _mm_load_ps(a);
__m128 vr = _mm_sqrt_ps(va);
_mm_store_ps(result, vr);
inline void simd_rsqrt_float32_128_aligned(float *a, float *result) {
__m128 va = _mm_load_ps(a);
__m128 vr = _mm_rsqrt_ps(va);
_mm_store_ps(result, vr);
@ -191,14 +247,6 @@ inline float simd_dot_product_float32_96(float *a, float *b) {
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
return _mm_cvtss_f32(dot_product);
// Dot product of two 3-float vectors; a and b must be 16-byte aligned (_mm_load_ps).
// NOTE(review): each load reads a full 16 bytes, i.e. one float beyond the three
// that are used — confirm the callers' storage is at least 16 bytes.
inline float simd_dot_product_float32_96_aligned(float *a, float *b) {
__m128 vec1 = _mm_load_ps(a);
__m128 vec2 = _mm_load_ps(b);
// Zero lane 3 of both inputs so the extra float cannot contribute to the sum.
vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
// 0x71: multiply lanes 0-2 (high nibble 0x7) and write the sum to lane 0 (low nibble 0x1).
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
return _mm_cvtss_f32(dot_product);
inline float simd_dot_product_float32_128(float *a, float *b) {
__m128 vec1 = _mm_loadu_ps(a);
__m128 vec2 = _mm_loadu_ps(b);
@ -217,8 +265,6 @@ inline float simd_dot_product_float32_128_aligned(float *a, float *b) {
#define simd_dot_product_float32_64 basic_dot_product_float32_64
#define simd_dot_product_float32_96 basic_dot_product_float32_96
#define simd_dot_product_float32_128 basic_dot_product_float32_128
#define simd_dot_product_float32_64_aligned basic_dot_product_float32_64
#define simd_dot_product_float32_96_aligned basic_dot_product_float32_96
#define simd_dot_product_float32_128_aligned basic_dot_product_float32_128
#endif // SIMD_ENABLE_SSE41
@ -275,16 +321,41 @@ inline void simd_div_float32_256_aligned(float32 *a, float32 *b, float32* result
__m256 vr = _mm256_div_ps(va, vb);
_mm256_store_ps(result, vr);
// Square root of eight packed floats (256 bits, AVX).
// Uses unaligned load/store, so a and result have no alignment requirement.
inline void simd_sqrt_float32_256(float *a, float *result) {
__m256 va = _mm256_loadu_ps(a);
__m256 vr = _mm256_sqrt_ps(va);
_mm256_storeu_ps(result, vr);
// Approximate reciprocal square root (1/sqrt(x)) of eight packed floats (256 bits, AVX).
// Uses unaligned load/store, so a and result have no alignment requirement.
// _mm256_rsqrt_ps is a hardware estimate, not an exact 1/sqrt.
inline void simd_rsqrt_float32_256(float *a, float *result) {
__m256 va = _mm256_loadu_ps(a);
__m256 vr = _mm256_rsqrt_ps(va);
_mm256_storeu_ps(result, vr);
// Square root of eight packed floats (256 bits, AVX), aligned variant.
// _mm256_load_ps/_mm256_store_ps require a and result to be 32-byte aligned.
inline void simd_sqrt_float32_256_aligned(float *a, float *result) {
__m256 va = _mm256_load_ps(a);
__m256 vr = _mm256_sqrt_ps(va);
_mm256_store_ps(result, vr);
// Approximate reciprocal square root (1/sqrt(x)) of eight packed floats, aligned variant.
// _mm256_load_ps/_mm256_store_ps require a and result to be 32-byte aligned.
// _mm256_rsqrt_ps is a hardware estimate, not an exact 1/sqrt.
inline void simd_rsqrt_float32_256_aligned(float *a, float *result) {
__m256 va = _mm256_load_ps(a);
__m256 vr = _mm256_rsqrt_ps(va);
_mm256_store_ps(result, vr);
#define simd_add_float32_256 basic_add_float32_256
#define simd_sub_float32_256 basic_sub_float32_256
#define simd_mul_float32_256 basic_mul_float32_256
#define simd_div_float32_256 basic_div_float32_256
#define simd_sqrt_float32_256 basic_sqrt_float32_256
#define simd_rsqrt_float32_256 basic_rsqrt_float32_256
#define simd_add_float32_256_aligned basic_add_float32_256
#define simd_sub_float32_256_aligned basic_sub_float32_256
#define simd_mul_float32_256_aligned basic_mul_float32_256
#define simd_div_float32_256_aligned basic_div_float32_256
#define simd_sqrt_float32_256_aligned basic_sqrt_float32_256
#define simd_rsqrt_float32_256_aligned basic_rsqrt_float32_256
@ -332,7 +403,6 @@ inline void simd_mul_int32_256_aligned(s32 *a, s32 *b, s32* result) {
#define simd_add_int32_256 basic_add_int32_256
#define simd_sub_int32_256 basic_sub_int32_256
#define simd_mul_int32_256 basic_mul_int32_256
#define simd_add_int32_256_aligned basic_add_int32_256
#define simd_sub_int32_256_aligned basic_sub_int32_256
#define simd_mul_int32_256_aligned basic_mul_int32_256
@ -432,6 +502,28 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
__m512i vr = _mm512_mullo_epi32(va, vb);
_mm512_store_si512((__m512i*)result, vr);
// Square root of sixteen packed floats (512 bits, AVX-512).
// Uses unaligned load/store, so a and result have no alignment requirement.
inline void simd_sqrt_float32_512(float *a, float *result) {
__m512 va = _mm512_loadu_ps(a);
__m512 vr = _mm512_sqrt_ps(va);
_mm512_storeu_ps(result, vr);
// Approximate reciprocal square root (1/sqrt(x)) of sixteen packed floats (512 bits, AVX-512).
// Uses unaligned load/store, so a and result have no alignment requirement.
// The estimate comes from _mm512_rsqrt14_ps, a different instruction than the
// _mm_rsqrt_ps/_mm256_rsqrt_ps used by the narrower variants, so results may not
// match them bit for bit.
inline void simd_rsqrt_float32_512(float *a, float *result) {
__m512 va = _mm512_loadu_ps(a);
__m512 vr = _mm512_rsqrt14_ps(va); // AVX-512 does not have _mm512_rsqrt_ps
_mm512_storeu_ps(result, vr);
// Square root of sixteen packed floats (512 bits, AVX-512), aligned variant.
// _mm512_load_ps/_mm512_store_ps require a and result to be 64-byte aligned.
inline void simd_sqrt_float32_512_aligned(float *a, float *result) {
__m512 va = _mm512_load_ps(a);
__m512 vr = _mm512_sqrt_ps(va);
_mm512_store_ps(result, vr);
// Approximate reciprocal square root (1/sqrt(x)) of sixteen packed floats, aligned variant.
// _mm512_load_ps/_mm512_store_ps require a and result to be 64-byte aligned.
// Uses the _mm512_rsqrt14_ps estimate, like the unaligned 512-bit variant.
inline void simd_rsqrt_float32_512_aligned(float *a, float *result) {
__m512 va = _mm512_load_ps(a);
__m512 vr = _mm512_rsqrt14_ps(va);
_mm512_store_ps(result, vr);
#define simd_add_float32_512 basic_add_float32_512
#define simd_sub_float32_512 basic_sub_float32_512
@ -440,7 +532,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512 basic_add_int32_512
#define simd_sub_int32_512 basic_sub_int32_512
#define simd_mul_int32_512 basic_mul_int32_512
#define simd_sqrt_float32_512 basic_sqrt_float32_512
#define simd_rsqrt_float32_512 basic_rsqrt_float32_512
#define simd_add_float32_512_aligned basic_add_float32_512
#define simd_sub_float32_512_aligned basic_sub_float32_512
#define simd_mul_float32_512_aligned basic_mul_float32_512
@ -448,6 +541,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512_aligned basic_add_int32_512
#define simd_sub_int32_512_aligned basic_sub_int32_512
#define simd_mul_int32_512_aligned basic_mul_int32_512
#define simd_sqrt_float32_512_aligned basic_sqrt_float32_512
#define simd_rsqrt_float32_512_aligned basic_rsqrt_float32_512
#endif // SIMD_ENABLE_AVX512
@ -461,10 +556,16 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_mul_float32_128 basic_mul_float32_128
#define simd_div_float32_64 basic_div_float32_64
#define simd_div_float32_128 basic_div_float32_128
#define simd_sqrt_float32_64 basic_sqrt_float32_64
#define simd_sqrt_float32_128 basic_sqrt_float32_128
#define simd_rsqrt_float32_64 basic_rsqrt_float32_64
#define simd_rsqrt_float32_128 basic_rsqrt_float32_128
#define simd_add_float32_128_aligned basic_add_float32_128
#define simd_sub_float32_128_aligned basic_sub_float32_128
#define simd_mul_float32_128_aligned basic_mul_float32_128
#define simd_div_float32_128_aligned basic_div_float32_128
#define simd_sqrt_float32_128_aligned basic_sqrt_float32_128
#define simd_rsqrt_float32_128_aligned basic_rsqrt_float32_128
// SSE2
#define simd_add_int32_128 basic_add_int32_128
@ -475,19 +576,26 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_mul_int32_128_aligned basic_mul_int32_128
// SSE41
#define simd_mul_int32_128 basic_mul_int32_128
#define simd_mul_int32_128_aligned basic_mul_int32_128
#define simd_dot_product_float32_64 basic_dot_product_float32_64
#define simd_dot_product_float32_96 basic_dot_product_float32_96
#define simd_dot_product_float32_128 basic_dot_product_float32_128
#define simd_dot_product_float32_128_aligned basic_dot_product_float32_128
// AVX
#define simd_add_float32_256 basic_add_float32_256
#define simd_sub_float32_256 basic_sub_float32_256
#define simd_mul_float32_256 basic_mul_float32_256
#define simd_div_float32_256 basic_div_float32_256
#define simd_sqrt_float32_256 basic_sqrt_float32_256
#define simd_rsqrt_float32_256 basic_rsqrt_float32_256
#define simd_add_float32_256_aligned basic_add_float32_256
#define simd_sub_float32_256_aligned basic_sub_float32_256
#define simd_mul_float32_256_aligned basic_mul_float32_256
#define simd_div_float32_256_aligned basic_div_float32_256
#define simd_sqrt_float32_256_aligned basic_sqrt_float32_256
#define simd_rsqrt_float32_256_aligned basic_rsqrt_float32_256
// AVX2
#define simd_add_int32_256 basic_add_int32_256
@ -505,6 +613,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512 basic_add_int32_512
#define simd_sub_int32_512 basic_sub_int32_512
#define simd_mul_int32_512 basic_mul_int32_512
#define simd_sqrt_float32_512 basic_sqrt_float32_512
#define simd_rsqrt_float32_512 basic_rsqrt_float32_512
#define simd_add_float32_512_aligned basic_add_float32_512
#define simd_sub_float32_512_aligned basic_sub_float32_512
#define simd_mul_float32_512_aligned basic_mul_float32_512
@ -512,9 +622,14 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512_aligned basic_add_int32_512
#define simd_sub_int32_512_aligned basic_sub_int32_512
#define simd_mul_int32_512_aligned basic_mul_int32_512
#define simd_sqrt_float32_512_aligned basic_sqrt_float32_512
#define simd_rsqrt_float32_512_aligned basic_rsqrt_float32_512
// Hand-written prototypes for the double-precision math routines used by the
// scalar (basic_*) fallbacks, declared here instead of including <math.h>.
// NOTE(review): sqrt is a standard C library function, but rsqrt is not part of
// the standard C library — confirm something defines it, otherwise any code that
// calls it will fail to link.
double __cdecl sqrt(_In_ double _X);
double __cdecl rsqrt(_In_ double _X);
inline void basic_add_float32_64 (float32 *a, float32 *b, float32* result) {
result[0] = a[0] + b[0];
result[1] = a[1] + b[1];
@ -638,6 +753,55 @@ inline float basic_dot_product_float32_96(float *a, float *b) {
// Scalar fallback: dot product of two 4-float vectors (sum of the four lane products).
inline float basic_dot_product_float32_128(float *a, float *b) {
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
// Scalar fallback: square root of two floats.
// sqrt operates on double; each input is promoted and the result is converted
// back to float on assignment.
inline void basic_sqrt_float32_64(float *a, float *result) {
result[0] = sqrt(a[0]);
result[1] = sqrt(a[1]);
// Scalar fallback: square root of three floats.
// Reads a[0..2] and writes result[0..2]; the computation goes through double-precision sqrt.
inline void basic_sqrt_float32_96(float *a, float *result) {
result[0] = sqrt(a[0]);
result[1] = sqrt(a[1]);
result[2] = sqrt(a[2]);
// Scalar fallback: square root of four floats.
// Reads a[0..3] and writes result[0..3]; the computation goes through double-precision sqrt.
inline void basic_sqrt_float32_128(float *a, float *result) {
result[0] = sqrt(a[0]);
result[1] = sqrt(a[1]);
result[2] = sqrt(a[2]);
result[3] = sqrt(a[3]);
// Scalar fallback: square root of eight floats, done as two 4-float halves.
inline void basic_sqrt_float32_256(float *a, float *result) {
basic_sqrt_float32_128(a, result);
basic_sqrt_float32_128(a+4, result+4);
// Scalar fallback: square root of sixteen floats, done as two 8-float halves.
inline void basic_sqrt_float32_512(float *a, float *result) {
basic_sqrt_float32_256(a, result);
basic_sqrt_float32_256(a+8, result+8);
inline void basic_rsqrt_float32_64(float *a, float *result) {
result[0] = rsqrt(a[0]);
result[1] = rsqrt(a[1]);
inline void basic_rsqrt_float32_96(float *a, float *result) {
result[0] = rsqrt(a[0]);
result[1] = rsqrt(a[1]);
result[2] = rsqrt(a[2]);
inline void basic_rsqrt_float32_128(float *a, float *result) {
result[0] = rsqrt(a[0]);
result[1] = rsqrt(a[1]);
result[2] = rsqrt(a[2]);
result[3] = rsqrt(a[3]);
// Scalar fallback: reciprocal square root of eight floats, done as two 4-float halves.
inline void basic_rsqrt_float32_256(float *a, float *result) {
basic_rsqrt_float32_128(a, result);
basic_rsqrt_float32_128(a+4, result+4);
// Scalar fallback: reciprocal square root of sixteen floats, done as two 8-float halves.
inline void basic_rsqrt_float32_512(float *a, float *result) {
basic_rsqrt_float32_256(a, result);
basic_rsqrt_float32_256(a+8, result+8);

View file

@ -5,7 +5,6 @@
void * memcpy (void *,const void *,size_t);
void* talloc(u64);
typedef struct string {
@ -13,6 +12,7 @@ typedef struct string {
u8 *data;
} string;
// fixed_string is an alias for STR.
// STR(s) builds a `string` compound literal from a null-terminated C string:
// the first field is the length computed by length_of_null_terminated_string,
// the second is the same pointer cast to u8*. No copy is made.
// NOTE(review): `s` is expanded twice and is not parenthesized inside the casts,
// so pass a simple expression without side effects.
#define fixed_string STR
#define STR(s) ((string){ length_of_null_terminated_string((const char*)s), (u8*)s })
inline u64 length_of_null_terminated_string(const char* cstring) {

View file

@ -213,7 +213,7 @@ void printf(const char* fmt, ...) {
typedef void(*Logger_Proc)(Log_Level level, string s);
#define LOG_BASE(level, ...) If context.logger then ((Logger_Proc)context.logger)(level, tprint(__VA_ARGS__))
// Formats the message with tprint and hands it to context.logger, if one is set.
// Wrapped in do { } while (0) so the expansion is a single statement: with a bare
// `if`, writing `if (cond) log_x(...); else ...` would attach the caller's `else`
// to the macro's internal `if`.
#define LOG_BASE(level, ...) do { if (context.logger) ((Logger_Proc)context.logger)(level, tprint(__VA_ARGS__)); } while (0)
#define log_verbose(...) LOG_BASE(LOG_VERBOSE, __VA_ARGS__)

View file

@ -1,6 +1,6 @@
// Custom allocators for lodepng
Allocator get_heap_allocator();
Allocator lodepng_allocator = {0};
/*Allocator lodepng_allocator = {0};
void* lodepng_malloc(size_t size) {
if(size > LODEPNG_MAX_ALLOC) return 0;
@ -28,4 +28,47 @@ void lodepng_free(void* ptr) {
// One day I might write my own png decoder so we don't even need this
#include "third_party/lodepng.h"
#include "third_party/lodepng.c"
#include "third_party/lodepng.c"*/
// Short integer aliases, named by intended bit width.
// NOTE(review): the widths assume char/short/int are 8/16/32 bits on the target —
// confirm; presumably these mirror typedefs the rest of the engine also declares.
typedef unsigned char u8;
typedef signed char s8;
typedef unsigned short u16;
typedef signed short s16;
typedef unsigned int u32;
typedef signed int s32;
// Allocation hook for the stb libraries: allocates `size` bytes from the allocator
// returned by get_heap_allocator(). A zero-size request returns 0 without
// calling the allocator.
void *stbtt_malloc(size_t size) {
if (!size) return 0;
return alloc(get_heap_allocator(), size);
// stb_truetype passes a user-data argument `u`; it is evaluated and discarded.
#define STBTT_malloc(x,u) ((void)(u),stbtt_malloc(x))
// Free hook for the stb libraries: releases `p` through the allocator returned by
// get_heap_allocator(). A null pointer is ignored.
void stbtt_free(void *p) {
if (!p) return;
dealloc(get_heap_allocator(), p);
// stb_truetype passes a user-data argument `u`; it is evaluated and discarded.
#define STBTT_free(x,u) ((void)(u),stbtt_free(x))
// Route stb_truetype's assertions through assert().
#define STBTT_assert(x) assert(x)
// Length of a null-terminated string, not counting the terminator.
// `str` must be non-null; it is dereferenced unconditionally.
size_t stbtt_strlen(const char* str) {
size_t count = 0;
while (str[count] != 0) count += 1;
return count;
// Have stb_truetype use the function above for its strlen calls.
#define STBTT_strlen(x) stbtt_strlen(x)
// Map stb_truetype's memory helpers onto memcpy/memset.
#define STBTT_memcpy memcpy
#define STBTT_memset memset
// Assertion hook for stb_image: on failure, writes through a null volatile pointer
// to force a crash at the failing site.
// Wrapped in do { } while (0) so `STBI_ASSERT(x);` is exactly one statement; a bare
// `{ ... }` followed by `;` breaks `if (c) STBI_ASSERT(x); else ...`.
#define STBI_ASSERT(x) do { if (!(x)) *(volatile char*)0 = 0; } while (0)
// Memory hooks for stb_image, sharing the stbtt_* helpers for malloc and free.
// STBI_REALLOC calls the heap allocator's proc directly with the new size, the
// old pointer and the ALLOCATOR_REALLOCATE message.
// NOTE(review): the meaning of the trailing 0 argument and the behaviour for a null
// `p` depend on the allocator proc, which is not visible here — confirm.
#define STBI_MALLOC(sz) stbtt_malloc(sz)
#define STBI_REALLOC(p,newsz) get_heap_allocator().proc(newsz, p, ALLOCATOR_REALLOCATE, 0)
#define STBI_FREE(p) stbtt_free(p)
#include "third_party/stb_image.h"
#include "third_party/stb_truetype.h"

oogabooga/third_party/stb_image.h vendored Normal file

File diff suppressed because it is too large Load diff

oogabooga/third_party/stb_truetype.h vendored Normal file

File diff suppressed because it is too large Load diff