- Replace lodepng with stb_image (& add stb_truetype for fonts)

- Fix bug where the d3d11 input assembler was not being created correctly
- Fix framerate being locked by swap chain present
- Move enable_vsync to window
- sqrt & rsqrt simd
- Add release build & run in vscode tasks & launch
- Cleanup
This commit is contained in:
Charlie 2024-07-04 20:56:27 +02:00
parent 4c5f882999
commit 05919248eb
26 changed files with 13557 additions and 305 deletions

4
.gitignore vendored
View file

@ -54,4 +54,6 @@ test_doc.vkn
*keybinds
*.rdi
google_trace.json
google_trace.json
build/*

16
.vscode/launch.json vendored
View file

@ -2,7 +2,7 @@
"version": "0.2.0",
"configurations": [
{
"name": "Launch with MSVC Debugger",
"name": "Launch Debug with MSVC Debugger",
"type": "cppvsdbg",
"request": "launch",
"program": "${workspaceFolder}/build/cgame.exe", // Run the output executable after compile
@ -11,7 +11,19 @@
"cwd": "${workspaceFolder}",
"environment": [],
"console":"integratedTerminal",
// "preLaunchTask": "Compile"
"preLaunchTask": "Compile"
},
{
"name": "Launch Release with MSVC Debugger",
"type": "cppvsdbg",
"request": "launch",
"program": "${workspaceFolder}/build/release/cgame.exe", // Run the output executable after compile
"args": [],
"stopAtEntry": false,
"cwd": "${workspaceFolder}",
"environment": [],
"console":"integratedTerminal",
"preLaunchTask": "Compile Release"
}
]
}

17
.vscode/tasks.json vendored
View file

@ -16,6 +16,21 @@
// "close": false,
// "showReuseMessage": true,
}
}
},
{
"label": "Compile Release",
"type": "shell",
"command": "${workspaceFolder}\\build_release",
"group": {
"kind": "build"
},
"problemMatcher": ["$gcc"],
"presentation": {
"clear": true,
// "revealProblems": "onProblem",
// "close": false,
// "showReuseMessage": true,
}
}
]
}

View file

@ -6,6 +6,6 @@ mkdir build
pushd build
clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -msse4.1
clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations -lkernel32 -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi
popd

View file

@ -3,7 +3,7 @@
///
// Build config stuff
#define RUN_TESTS 1
#define RUN_TESTS 0
// This is only for people developing oogabooga!
#define OOGABOOGA_DEV 1
@ -13,6 +13,7 @@
// ENABLE_SIMD Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which doesn't
#define ENABLE_SIMD 1
#define INITIAL_PROGRAM_MEMORY_SIZE MB(5)
typedef struct Context_Extra {
@ -21,8 +22,6 @@ typedef struct Context_Extra {
// This needs to be defined before oogabooga if we want extra stuff in context
#define CONTEXT_EXTRA Context_Extra
#define GFX_RENDERER GFX_RENDERER_D3D11
// This defaults to "entry", but we can set it to anything (except "main" or other existing proc names)
#define ENTRY_PROC entry
@ -38,13 +37,13 @@ typedef struct Context_Extra {
//
// this is a minimal starting point for new projects. Copy & rename to get started
#include "oogabooga/examples/minimal_game_loop.c"
// #include "oogabooga/examples/minimal_game_loop.c"
// An engine dev stress test for rendering
// #include "oogabooga/examples/renderer_stress_test.c"
// Randy's example game that he's building out as a tutorial for using the engine
// #include "entry_randygame.c"
#include "entry_randygame.c"
// This is where you swap in your own project!
// #include "entry_yourepicgamename.c"

View file

@ -1,14 +1,18 @@
@echo off
rmdir /S /Q build
mkdir build
if exist build/dissassembly (
rmdir /s /q build
)
if not exist build (
mkdir build
)
pushd build
mkdir release
pushd release
mkdir dissassembly
pushd dissassembly
clang -o cgame.asm ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -ffast-math -funroll-loops -finline-functions -fvectorize -fslp-vectorize -fomit-frame-pointer -fno-exceptions -fno-rtti -S -masm=intel
clang -o cgame.asm ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -S -masm=intel
popd
popd

View file

@ -1,5 +1,7 @@
@echo off
rmdir /S /Q build
if exist build (
rmdir /s /q build
)
mkdir build
pushd build
@ -7,7 +9,7 @@ pushd build
mkdir release
pushd release
clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -msse4.1
clang -o cgame.exe ../../build.c -Ofast -DNDEBUG -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -Wno-deprecated-declarations -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions
popd
popd

View file

@ -10,37 +10,22 @@
#define local_persist static
#define forward_global extern
// Haters gonna hate
#define If if (
#define then )
// If cond then {}
#ifdef _MSC_VER
inline void os_break() {
__debugbreak();
volatile int *a = 0;
*a = 5;
}
#else
#error "Only msvc compiler supported at the moment";
#endif
void printf(const char* fmt, ...);
#define ASSERT_STR_HELPER(x) #x
#define ASSERT_STR(x) ASSERT_STR_HELPER(x)
#define assert_line(line, cond, ...) if(!(cond)) { printf("Assertion failed in file " __FILE__ " on line " ASSERT_STR(line) "\nFailed Condition: " #cond ". Message: " __VA_ARGS__); os_break(); }
#define assert(cond, ...) assert_line(__LINE__, cond, __VA_ARGS__);
#define assert_line(line, cond, ...) {if(!(cond)) { printf("Assertion failed in file " __FILE__ " on line " ASSERT_STR(line) "\nFailed Condition: " #cond ". Message: " __VA_ARGS__); crash(); }}
#define assert(cond, ...) {assert_line(__LINE__, cond, __VA_ARGS__)}
#define DEFER(start, end) for(int _i_ = ((start), 0); _i_ == 0; _i_ += 1, (end))
#if CONFIGURATION == RELEASE
#undef assert
#define assert(...)
#define assert(...) (void)0;
#endif
#define panic(...) { print(__VA_ARGS__); os_break(); }
#define panic(...) { print(__VA_ARGS__); crash(); }
#define cast(t) (t)
@ -48,7 +33,6 @@ void printf(const char* fmt, ...);
#define FIRST_ARG(arg1, ...) arg1
#define SECOND_ARG(arg1, arg2, ...) arg2
#define print(...) _Generic((FIRST_ARG(__VA_ARGS__)), \

View file

@ -29,6 +29,11 @@ typedef struct Cpu_Capabilities {
#define inline __forceinline
#define alignat(x) __declspec(align(x))
#define COMPILER_HAS_MEMCPY_INTRINSICS 1
// Hard-stops the program (MSVC variant).
// Raises a breakpoint with __debugbreak() and then writes through a null
// pointer; presumably the null write is there so the process still faults
// if execution continues past the breakpoint.
inline void crash() {
    volatile int *null_address = 0;
    __debugbreak();
    *null_address = 5;
}
#include <intrin.h>
#pragma intrinsic(__rdtsc)
inline u64 rdtsc() {
@ -66,6 +71,11 @@ typedef struct Cpu_Capabilities {
#define inline __attribute__((always_inline)) inline
#define alignat(x) __attribute__((aligned(x)))
#define COMPILER_HAS_MEMCPY_INTRINSICS 1
// Hard-stops the program (GCC/Clang variant).
// Calls __builtin_trap() and then writes through a null pointer; the null
// write mirrors the MSVC variant and presumably acts as a fallback fault.
inline void crash() {
    volatile int *null_address = 0;
    __builtin_trap();
    *null_address = 5;
}
inline u64 rdtsc() {
unsigned int lo, hi;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
@ -119,7 +129,6 @@ typedef struct Cpu_Capabilities {
#warning "Compiler is not explicitly supported, some things will probably not work as expected"
#endif
Cpu_Capabilities query_cpu_capabilities() {
Cpu_Capabilities result = {0};

View file

@ -3,10 +3,10 @@
struct VS_INPUT
{
float4 position : POSITION;
float2 uv : TEXCOORD;
float4 color : COLOR;
int texture_index: TEXTURE_INDEX;
float4 position : POSITION;
};
struct PS_INPUT
@ -79,8 +79,8 @@ float4 ps_main(PS_INPUT input) : SV_TARGET
*/
const u8 IMAGE_SHADER_VERTEX_BLOB_BYTES[]= {
0x44, 0x58, 0x42, 0x43, 0xdd, 0x02, 0x55, 0xb0, 0x7b, 0x83, 0x6c, 0x34, 0x45, 0xe8, 0x51, 0xd4,
0x76, 0xbf, 0x66, 0x77, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x03, 0x00, 0x00, 0x05, 0x00, 0x00,
0x44, 0x58, 0x42, 0x43, 0xf4, 0xea, 0x50, 0x9f, 0xcf, 0xeb, 0x01, 0x7b, 0x78, 0x58, 0xd5, 0x6b,
0x4f, 0x9f, 0xc1, 0xe2, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x03, 0x00, 0x00, 0x05, 0x00, 0x00,
0x00, 0x34, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x38, 0x01, 0x00, 0x00, 0xd4, 0x01,
0x00, 0x00, 0xa0, 0x02, 0x00, 0x00, 0x52, 0x44, 0x45, 0x46, 0x64, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
@ -92,14 +92,14 @@ const u8 IMAGE_SHADER_VERTEX_BLOB_BYTES[]= {
0x6c, 0x65, 0x72, 0x20, 0x31, 0x30, 0x2e, 0x31, 0x00, 0x49, 0x53, 0x47, 0x4e, 0x90, 0x00,
0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x03, 0x03, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00, 0x00, 0x77, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02,
0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00,
0x00, 0x54, 0x45, 0x58, 0x43, 0x4f, 0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52,
0x00, 0x54, 0x45, 0x58, 0x54, 0x55, 0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00,
0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x00, 0xab, 0xab, 0x4f, 0x53, 0x47, 0x4e,
0x0f, 0x0f, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x7a, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02,
0x00, 0x00, 0x00, 0x0f, 0x0f, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
0x00, 0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x00, 0x54, 0x45, 0x58, 0x43, 0x4f,
0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52, 0x00, 0x54, 0x45, 0x58, 0x54, 0x55,
0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00, 0xab, 0xab, 0x4f, 0x53, 0x47, 0x4e,
0x94, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@ -111,19 +111,19 @@ const u8 IMAGE_SHADER_VERTEX_BLOB_BYTES[]= {
0x54, 0x45, 0x58, 0x43, 0x4f, 0x4f, 0x52, 0x44, 0x00, 0x43, 0x4f, 0x4c, 0x4f, 0x52, 0x00,
0x54, 0x45, 0x58, 0x54, 0x55, 0x52, 0x45, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x00, 0xab,
0xab, 0xab, 0x53, 0x48, 0x45, 0x58, 0xc4, 0x00, 0x00, 0x00, 0x50, 0x00, 0x01, 0x00, 0x31,
0x00, 0x00, 0x00, 0x6a, 0x08, 0x00, 0x01, 0x5f, 0x00, 0x00, 0x03, 0x32, 0x10, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00,
0x00, 0x5f, 0x00, 0x00, 0x03, 0x12, 0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x5f, 0x00,
0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x04, 0xf2,
0x00, 0x00, 0x00, 0x6a, 0x08, 0x00, 0x01, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00,
0x00, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x03, 0x32, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00,
0x00, 0x5f, 0x00, 0x00, 0x03, 0xf2, 0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x5f, 0x00,
0x00, 0x03, 0x12, 0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x04, 0xf2,
0x20, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03,
0x32, 0x20, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03, 0xf2, 0x20, 0x10,
0x00, 0x02, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x03, 0x12, 0x20, 0x10, 0x00, 0x03, 0x00,
0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0xf2, 0x20, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46,
0x1e, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x32, 0x20, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00,
0x05, 0xf2, 0x20, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x1e, 0x10, 0x00, 0x01, 0x00,
0x1e, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x32, 0x20, 0x10, 0x00,
0x01, 0x00, 0x00, 0x00, 0x46, 0x10, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00,
0x05, 0xf2, 0x20, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x1e, 0x10, 0x00, 0x02, 0x00,
0x00, 0x00, 0x36, 0x00, 0x00, 0x05, 0x12, 0x20, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0a,
0x10, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x10, 0x10, 0x00, 0x03, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x01, 0x53, 0x54, 0x41, 0x54,
0x94, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,

View file

@ -1,9 +1,9 @@
struct VS_INPUT
{
float4 position : POSITION;
float2 uv : TEXCOORD;
float4 color : COLOR;
int texture_index: TEXTURE_INDEX;
float4 position : POSITION;
};
struct PS_INPUT

View file

@ -197,65 +197,39 @@ Draw_Quad *draw_image_xform(Gfx_Image *image, Matrix4 xform, Vector2 size, Vecto
#define COLOR_BLACK ((Vector4){0.0, 0.0, 0.0, 1.0})
Gfx_Image *load_image_from_disk(string path, Allocator allocator) {
string png;
bool ok = os_read_entire_file(path, &png, allocator);
if (!ok) return 0;
string png;
bool ok = os_read_entire_file(path, &png, allocator);
if (!ok) return 0;
Gfx_Image *image = alloc(allocator, sizeof(Gfx_Image));
// This is fucking terrible I gotta write my own decoder
lodepng_allocator = allocator;
LodePNGState state;
lodepng_state_init(&state);
u32 error = lodepng_inspect(&image->width, &image->height, &state, png.data, png.count);
if (error) {
return 0;
}
// 5 lines of code to say "ignore_adler32 = true" (because it's broken and gives me an error)
// I JUST WANT TO LOAD A PNG
LodePNGDecoderSettings decoder;
lodepng_decoder_settings_init(&decoder);
lodepng_decompress_settings_init(&decoder.zlibsettings);
decoder.zlibsettings.ignore_adler32 = true;
state.decoder = decoder;
error = lodepng_decode(&image->data, &image->width, &image->height, &state, png.data, png.count);
lodepng_state_cleanup(&state);
dealloc_string(allocator, png);
if (error) {
return 0;
}
// We need to flip the image
u32 row_bytes = image->width * 4; // #Magicvalue assuming 4 bytes
u8* temp_row = (u8*)alloc(temp, row_bytes);
for (u32 i = 0; i < image->height / 2; i++) {
u8* top_row = image->data + i * row_bytes;
u8* bottom_row = image->data + (image->height - i - 1) * row_bytes;
// Swap the top row with the bottom row
memcpy(temp_row, top_row, row_bytes);
memcpy(top_row, bottom_row, row_bytes);
memcpy(bottom_row, temp_row, row_bytes);
Gfx_Image *image = alloc(allocator, sizeof(Gfx_Image));
// Use stb_image to load and decode the PNG
int width, height, channels;
stbi_set_flip_vertically_on_load(1); // stb_image can flip the image on load
unsigned char* stb_data = stbi_load_from_memory(png.data, png.count, &width, &height, &channels, STBI_rgb_alpha);
if (!stb_data) {
dealloc(allocator, image);
dealloc_string(allocator, png);
return 0;
}
image->gfx_handle = GFX_INVALID_HANDLE; // This is handled in gfx
image->allocator = allocator;
return image;
image->data = stb_data;
image->width = width;
image->height = height;
image->gfx_handle = GFX_INVALID_HANDLE; // This is handled in gfx
image->allocator = allocator;
dealloc_string(allocator, png);
return image;
}
void delete_image(Gfx_Image *image) {
dealloc(image->allocator, image->data);
image->width = 0;
image->height = 0;
draw_frame.garbage_stack[draw_frame.garbage_stack_count] = image->gfx_handle;
draw_frame.garbage_stack_count += 1;
dealloc(image->allocator, image);
stbi_image_free(image->data); // Free the image data allocated by stb_image
image->width = 0;
image->height = 0;
draw_frame.garbage_stack[draw_frame.garbage_stack_count] = image->gfx_handle;
draw_frame.garbage_stack_count += 1;
dealloc(image->allocator, image);
}

View file

@ -15,8 +15,6 @@ int entry(int argc, char **argv) {
Gfx_Image *hammer_image = load_image_from_disk(STR("oogabooga/examples/hammer.png"), get_heap_allocator());
assert(hammer_image, "Failed loading hammer.png");
Gfx_Font *font = load_font_From_disk(
seed_for_random = os_get_current_cycle_count();
const float64 fps_limit = 69000;
@ -36,7 +34,9 @@ int entry(int argc, char **argv) {
delta = now - last_time;
}
last_time = now;
os_update();
tm_scope_cycles("os_update") {
os_update();
}
if (is_key_just_released(KEY_ESCAPE)) {
window.should_close = true;
@ -102,11 +102,10 @@ int entry(int argc, char **argv) {
draw_image(bush_image, v2(0.65, 0.65), v2(0.2*sin(now), 0.2*sin(now)), COLOR_WHITE);
draw_frame.font = STR("");
tm_scope_cycles("gfx_update") {
gfx_update();
}
draw_text();
gfx_update();
if (is_key_just_released('E')) {
log("FPS: %.2f", 1.0 / delta);

View file

@ -13,10 +13,10 @@ const Gfx_Handle GFX_INVALID_HANDLE = 0;
string temp_win32_null_terminated_wide_to_fixed_utf8(const u16 *utf16);
typedef struct D3D11_Vertex {
typedef struct alignat(16) D3D11_Vertex {
Vector4 color;
Vector4 position;
Vector2 uv;
Vector4 color;
int texture_index;
} D3D11_Vertex;
@ -81,14 +81,19 @@ void CALLBACK d3d11_debug_callback(D3D11_MESSAGE_CATEGORY category, D3D11_MESSAG
case D3D11_MESSAGE_SEVERITY_CORRUPTION:
case D3D11_MESSAGE_SEVERITY_ERROR:
log_error(msg);
break;
case D3D11_MESSAGE_SEVERITY_WARNING:
log_warning(msg);
break;
case D3D11_MESSAGE_SEVERITY_INFO:
log_info(msg);
break;
case D3D11_MESSAGE_SEVERITY_MESSAGE:
log_verbose(msg);
break;
default:
log("Ligma");
break;
}
}
@ -127,7 +132,8 @@ void d3d11_update_swapchain() {
if (create) {
DXGI_SWAP_CHAIN_DESC1 scd = ZERO(DXGI_SWAP_CHAIN_DESC1);
scd.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
//scd.BufferDesc.RefreshRate.Numerator = xx st.refresh_rate;
scd.Flags = DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING;
//scd.BufferDesc.RefreshRate.Numerator = 0;
//scd.BufferDesc.RefreshRate.Denominator = 1;
scd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
@ -137,23 +143,23 @@ void d3d11_update_swapchain() {
scd.Scaling = DXGI_SCALING_STRETCH; // for compatability with 7
}
// Windows 10 allows to use DXGI_SWAP_EFFECT_FLIP_DISCARD
// for Windows 8 compatibility use DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL
// for Windows 7 compatibility use DXGI_SWAP_EFFECT_DISCARD
if (d3d11_feature_level >= D3D_FEATURE_LEVEL_11_0) {
// this is supported only on FLIP presentation model
scd.Scaling = DXGI_SCALING_NONE;
scd.SwapEffect = DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL;
scd.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD;
scd.BufferCount = 3;
gfx._can_vsync = false;
log_verbose("Present mode is flip discard, 3 buffers");
} else {
scd.SwapEffect = DXGI_SWAP_EFFECT_DISCARD;
scd.BufferCount = 2;
gfx._can_vsync = true;
log_verbose("Present mode is discard, 2 buffers");
}
// Obtain DXGI factory from device
IDXGIDevice *dxgi_device;
hr = VTABLE(QueryInterface, d3d11_device, &IID_IDXGIDevice, cast(void**)&dxgi_device);
@ -224,7 +230,7 @@ void d3d11_update_swapchain() {
void gfx_init() {
gfx.enable_vsync = false;
window.enable_vsync = false;
log_verbose("d3d11 gfx_init");
@ -426,42 +432,53 @@ void gfx_init() {
log_verbose("Shaders created");
D3D11_INPUT_ELEMENT_DESC layout[4];
memset(layout, 0, sizeof(layout));
layout[0].SemanticName = "POSITION";
layout[0].SemanticIndex = 0;
layout[0].Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
layout[0].InputSlot = 0;
layout[0].AlignedByteOffset = offsetof(D3D11_Vertex, position);
layout[0].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
layout[0].InstanceDataStepRate = 0;
layout[1].SemanticName = "TEXCOORD";
layout[1].SemanticIndex = 0;
layout[1].Format = DXGI_FORMAT_R32G32_FLOAT;
layout[1].InputSlot = 0;
layout[1].AlignedByteOffset = offsetof(D3D11_Vertex, uv);
layout[1].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
layout[1].InstanceDataStepRate = 0;
layout[2].SemanticName = "COLOR";
layout[2].SemanticIndex = 0;
layout[2].Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
layout[2].InputSlot = 0;
layout[2].AlignedByteOffset = offsetof(D3D11_Vertex, color);
layout[2].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
layout[2].InstanceDataStepRate = 0;
layout[3].SemanticName = "TEXTURE_INDEX";
layout[3].SemanticIndex = 0;
layout[3].Format = DXGI_FORMAT_R32_SINT;
layout[3].InputSlot = 0;
layout[3].AlignedByteOffset = offsetof(D3D11_Vertex, texture_index);
layout[3].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
layout[3].InstanceDataStepRate = 0;
hr = VTABLE(CreateInputLayout, d3d11_device, layout, 4, vs_buffer, vs_size, &d3d11_image_vertex_layout);
win32_check_hr(hr);
#if OOGABOOGA_DEV
D3D11Release(vs_blob);
D3D11Release(ps_blob);
#endif
D3D11_INPUT_ELEMENT_DESC layout[4];
memset(layout, 0, sizeof(layout));
layout[0] = (D3D11_INPUT_ELEMENT_DESC){
"POSITION", 0,
DXGI_FORMAT_R32G32B32A32_FLOAT, 0,
offsetof(D3D11_Vertex, position),
D3D11_INPUT_PER_VERTEX_DATA, 0
};
layout[1] = (D3D11_INPUT_ELEMENT_DESC){
"TEXCOORD", 0,
DXGI_FORMAT_R32G32_FLOAT, 0,
offsetof(D3D11_Vertex, uv),
D3D11_INPUT_PER_VERTEX_DATA, 0
};
layout[2] = (D3D11_INPUT_ELEMENT_DESC){
"COLOR", 0,
DXGI_FORMAT_R32G32B32A32_FLOAT, 0,
offsetof(D3D11_Vertex, color),
D3D11_INPUT_PER_VERTEX_DATA, 0
};
layout[3] = (D3D11_INPUT_ELEMENT_DESC){
"TEXTURE_INDEX", 0,
DXGI_FORMAT_R32_SINT, 0,
offsetof(D3D11_Vertex, texture_index),
D3D11_INPUT_PER_VERTEX_DATA, 0
};
hr = VTABLE(CreateInputLayout, d3d11_device, layout, 4, vs_buffer, vs_size, &d3d11_image_vertex_layout);
log_info("D3D11 init done");
}
void d3d11_draw_call(int number_of_rendered_quads, ID3D11ShaderResourceView **textures, u64 num_textures) {
@ -493,7 +510,6 @@ void d3d11_draw_call(int number_of_rendered_quads, ID3D11ShaderResourceView **te
}
void gfx_update() {
if (window.should_close) return;
VTABLE(ClearRenderTargetView, d3d11_context, d3d11_window_render_target_view, (float*)&window.clear_color);
@ -501,59 +517,61 @@ void gfx_update() {
HRESULT hr;
///
// purge garbage
for (u64 i = 0; i < draw_frame.garbage_stack_count; i++) {
ID3D11ShaderResourceView *view = draw_frame.garbage_stack[i];
ID3D11Resource *resource = 0;
VTABLE(GetResource, view, &resource);
ID3D11Texture2D *texture = 0;
hr = VTABLE(QueryInterface, resource, &IID_ID3D11Texture2D, (void**)&texture);
if (SUCCEEDED(hr)) {
D3D11Release(view);
D3D11Release(texture);
log("Destroyed an image");
} else {
panic("Unhandled D3D11 resource deletion");
tm_scope_cycles("Frame setup") {
///
// purge garbage
for (u64 i = 0; i < draw_frame.garbage_stack_count; i++) {
ID3D11ShaderResourceView *view = draw_frame.garbage_stack[i];
ID3D11Resource *resource = 0;
VTABLE(GetResource, view, &resource);
ID3D11Texture2D *texture = 0;
hr = VTABLE(QueryInterface, resource, &IID_ID3D11Texture2D, (void**)&texture);
if (SUCCEEDED(hr)) {
D3D11Release(view);
D3D11Release(texture);
log("Destroyed an image");
} else {
panic("Unhandled D3D11 resource deletion");
}
}
///
// Maybe resize swap chain
RECT client_rect;
bool ok = GetClientRect(window._os_handle, &client_rect);
assert(ok, "GetClientRect failed with error code %lu", GetLastError());
u32 window_width = client_rect.right-client_rect.left;
u32 window_height = client_rect.bottom-client_rect.top;
if (window_width != d3d11_swap_chain_width || window_height != d3d11_swap_chain_height) {
d3d11_update_swapchain();
}
///
// Maybe grow quad vbo
u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6;
if (required_size > d3d11_quad_vbo_size) {
if (d3d11_quad_vbo) {
D3D11Release(d3d11_quad_vbo);
dealloc(get_heap_allocator(), d3d11_staging_quad_buffer);
}
D3D11_BUFFER_DESC desc = ZERO(D3D11_BUFFER_DESC);
desc.Usage = D3D11_USAGE_DYNAMIC;
desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
desc.ByteWidth = required_size;
desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
HRESULT hr = VTABLE(CreateBuffer, d3d11_device, &desc, 0, &d3d11_quad_vbo);
assert(SUCCEEDED(hr), "CreateBuffer failed");
d3d11_quad_vbo_size = required_size;
d3d11_staging_quad_buffer = alloc(get_heap_allocator(), d3d11_quad_vbo_size);
assert((u64)d3d11_staging_quad_buffer%16 == 0);
log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size);
}
}
///
// Maybe resize swap chain
RECT client_rect;
bool ok = GetClientRect(window._os_handle, &client_rect);
assert(ok, "GetClientRect failed with error code %lu", GetLastError());
u32 window_width = client_rect.right-client_rect.left;
u32 window_height = client_rect.bottom-client_rect.top;
if (window_width != d3d11_swap_chain_width || window_height != d3d11_swap_chain_height) {
d3d11_update_swapchain();
}
///
// Maybe grow quad vbo
u32 required_size = sizeof(D3D11_Vertex) * draw_frame.num_blocks*QUADS_PER_BLOCK*6;
if (required_size > d3d11_quad_vbo_size) {
if (d3d11_quad_vbo) {
D3D11Release(d3d11_quad_vbo);
dealloc(get_heap_allocator(), d3d11_staging_quad_buffer);
}
D3D11_BUFFER_DESC desc = ZERO(D3D11_BUFFER_DESC);
desc.Usage = D3D11_USAGE_DYNAMIC;
desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
desc.ByteWidth = required_size;
desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
HRESULT hr = VTABLE(CreateBuffer, d3d11_device, &desc, 0, &d3d11_quad_vbo);
assert(SUCCEEDED(hr), "CreateBuffer failed");
d3d11_quad_vbo_size = required_size;
d3d11_staging_quad_buffer = alloc(get_heap_allocator(), d3d11_quad_vbo_size);
log_verbose("Grew quad vbo to %d bytes.", d3d11_quad_vbo_size);
}
f64 rest_before = os_get_current_time_in_seconds();
if (draw_frame.num_blocks > 0) {
///
// Render geometry into the vbo quad list
@ -569,8 +587,8 @@ void gfx_update() {
Draw_Quad_Block *block = &first_block;
tm_scope_cycles("Quad processing") {
while (block != 0 && block->num_quads > 0) tm_scope_cycles("ad2As") {
for (u64 i = 0; i < block->num_quads; i++) tm_scope_cycles("Single quad") {
while (block != 0 && block->num_quads > 0) tm_scope_cycles("Quad block") {
for (u64 i = 0; i < block->num_quads; i++) {
Draw_Quad *q = &block->quad_buffer[i];
@ -620,7 +638,7 @@ void gfx_update() {
if (num_textures >= 32) {
// If max textures reached, make a draw call and start over
D3D11_MAPPED_SUBRESOURCE buffer_mapping;
VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_NO_OVERWRITE, 0, &buffer_mapping);
VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping);
memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
@ -676,30 +694,29 @@ void gfx_update() {
}
}
D3D11_MAPPED_SUBRESOURCE buffer_mapping;
VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_NO_OVERWRITE, 0, &buffer_mapping);
memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
tm_scope_cycles("Write to gpu") {
D3D11_MAPPED_SUBRESOURCE buffer_mapping;
tm_scope_cycles("The Map call") {
hr = VTABLE(Map, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0, D3D11_MAP_WRITE_DISCARD, 0, &buffer_mapping);
win32_check_hr(hr);
}
tm_scope_cycles("The memcpy") {
memcpy(buffer_mapping.pData, d3d11_staging_quad_buffer, number_of_rendered_quads*sizeof(D3D11_Vertex)*6);
}
tm_scope_cycles("The Unmap call") {
VTABLE(Unmap, d3d11_context, (ID3D11Resource*)d3d11_quad_vbo, 0);
}
}
///
// Draw call
u64 before_draw = os_get_current_cycle_count();
d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
u64 after_draw = os_get_current_cycle_count();
//log("Draw call took %llu cycles", after_draw-before_draw);
tm_scope_cycles("Draw call") d3d11_draw_call(number_of_rendered_quads, textures, num_textures);
}
f64 rest_after = os_get_current_time_in_seconds();
if (is_key_just_pressed('E'))
log("The rest took %.2fms", (rest_after-rest_before)*1000.0);
f64 before_present = os_get_current_time_in_seconds();
hr = VTABLE(Present, d3d11_swap_chain, gfx._can_vsync && gfx.enable_vsync, 0);
f64 after = os_get_current_time_in_seconds();
if (is_key_just_pressed('E'))
log("Present took %.2fms", (after-before_present)*1000.0);
win32_check_hr(hr);
tm_scope_cycles("Present") {
hr = VTABLE(Present, d3d11_swap_chain, window.enable_vsync, window.enable_vsync ? 0 : DXGI_PRESENT_ALLOW_TEARING);
win32_check_hr(hr);
}
#if CONFIGURATION == DEBUG
///

View file

@ -15,18 +15,6 @@
#error "Unknown renderer GFX_RENDERER defined"
#endif
typedef struct Gfx_State {
// config
bool enable_vsync;
// readonly
bool _can_vsync;
} Gfx_State;
Gfx_State gfx;
forward_global const Gfx_Handle GFX_INVALID_HANDLE;
typedef struct Gfx_Image {

View file

@ -133,7 +133,7 @@ inline float v3_dot_product(Vector3 a, Vector3 b) {
return simd_dot_product_float32_96((float*)&a, (float*)&b);
}
inline float v4_dot_product(Vector4 a, Vector4 b) {
return simd_dot_product_float32_128((float*)&a, (float*)&b);
return simd_dot_product_float32_128_aligned((float*)&a, (float*)&b);
}
Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) {

View file

@ -24,7 +24,7 @@ void* initialization_allocator_proc(u64 size, void *p, Allocator_Message message
if (init_memory_head >= ((u8*)init_memory_arena+INIT_MEMORY_SIZE)) {
os_write_string_to_stdout(STR("Out of initialization memory! Please provide more by increasing INIT_MEMORY_SIZE"));
os_break();
crash();
}
return p;
break;

View file

@ -145,23 +145,22 @@ typedef u8 bool;
#warning "Compiler is not explicitly supported, some things will probably not work as expected"
#endif
#include "cpu.c"
#define DEBUG 0
#define VERY_DEBUG 1
#define RELEASE 2
#if !defined(CONFIGURATION)
#if defined(NDEBUG)
#define CONFIGURATION RELEASE
#else
#define CONFIGURATION DEBUG
#endif
#if defined(NDEBUG)
#define CONFIGURATION RELEASE
#else
#define CONFIGURATION DEBUG
#endif
#include "cpu.c"
#ifndef ENTRY_PROC
#define ENTRY_PROC entry
#endif

View file

@ -135,6 +135,8 @@ LRESULT CALLBACK win32_window_proc(HWND passed_window, UINT message, WPARAM wpar
void os_init(u64 program_memory_size) {
memset(&window, 0, sizeof(window));
timeBeginPeriod(1);
#if CONFIGURATION == RELEASE
SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);

View file

@ -61,36 +61,6 @@ inline int crt_vprintf(const char* fmt, va_list args) {
return os.crt_vprintf(fmt, args);
}
#if !defined(COMPILER_HAS_MEMCPY_INTRINSICS) || CONFIGURATION == DEBUG
inline void* naive_memcpy(void* dest, const void* source, size_t size) {
for (u64 i = 0; i < (u64)size; i++) ((u8*)dest)[i] = ((u8*)source)[i];
return dest;
}
inline void* memcpy(void* dest, const void* source, size_t size) {
if (!os.crt_memcpy) return naive_memcpy(dest, source, size);
return os.crt_memcpy(dest, source, size);
}
// Byte-wise comparison of the first `amount` bytes of `a` and `b`.
// Returns 0 when all bytes match. Otherwise the sign of the result is
// determined by the first differing pair of bytes, compared as unsigned
// char: negative when a's byte is smaller, positive when it is larger.
// This matches the standard memcmp contract, so callers that order or
// sort by the result behave the same as with the CRT implementation.
inline int naive_memcmp(const void* a, const void* b, size_t amount) {
    const unsigned char *lhs = (const unsigned char*)a;
    const unsigned char *rhs = (const unsigned char*)b;
    for (size_t i = 0; i < amount; i++) {
        if (lhs[i] != rhs[i]) {
            return (lhs[i] < rhs[i]) ? -1 : 1;
        }
    }
    return 0;
}
inline int memcmp(const void* a, const void* b, size_t amount) {
if (!os.crt_memcmp) return naive_memcmp(a, b, amount);
return os.crt_memcmp(a, b, amount);
}
inline void* naive_memset(void* dest, int value, size_t amount) {
for (u64 i = 0; i < (u64)amount; i++) ((u8*)dest)[i] = (u8)value;
return dest;
}
inline void* memset(void* dest, int value, size_t amount) {
if (!os.crt_memset) return naive_memset(dest, value, amount);
return os.crt_memset(dest, value, amount);
}
#endif
// Returns true when the first `count` bytes at `a` and `b` compare equal
// according to memcmp.
inline bool bytes_match(void *a, void *b, u64 count) {
    int difference = memcmp(a, b, count);
    return difference == 0;
}
inline int vsnprintf(char* buffer, size_t n, const char* fmt, va_list args) {
@ -333,6 +303,7 @@ typedef struct Os_Window {
u32 x;
u32 y;
Vector4 clear_color;
bool enable_vsync;
bool should_close;

View file

@ -30,6 +30,16 @@ inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result);
inline float basic_dot_product_float32_64(float *a, float *b);
inline float basic_dot_product_float32_96(float *a, float *b);
inline float basic_dot_product_float32_128(float *a, float *b);
inline void basic_sqrt_float32_64(float *a, float *result);
inline void basic_sqrt_float32_96(float *a, float *result);
inline void basic_sqrt_float32_128(float *a, float *result);
inline void basic_sqrt_float32_256(float *a, float *result);
inline void basic_sqrt_float32_512(float *a, float *result);
inline void basic_rsqrt_float32_64(float *a, float *result);
inline void basic_rsqrt_float32_96(float *a, float *result);
inline void basic_rsqrt_float32_128(float *a, float *result);
inline void basic_rsqrt_float32_256(float *a, float *result);
inline void basic_rsqrt_float32_512(float *a, float *result);
@ -123,6 +133,52 @@ inline void simd_div_float32_128_aligned(float *a, float *b, float* result) {
__m128 vr = _mm_div_ps(va, vb);
_mm_store_ps(result, vr);
}
inline void simd_sqrt_float32_96(float *a, float *result) {
__m128 va = _mm_loadu_ps(a);
va = _mm_and_ps(va, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); // Mask last element
__m128 vr = _mm_sqrt_ps(va);
_mm_storeu_ps(result, vr);
}
inline void simd_rsqrt_float32_96(float *a, float *result) {
__m128 va = _mm_loadu_ps(a);
va = _mm_and_ps(va, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); // Mask last element
__m128 vr = _mm_rsqrt_ps(va);
_mm_storeu_ps(result, vr);
}
inline void simd_sqrt_float32_64(float *a, float *result) {
__m128 va = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)a);
__m128 vr = _mm_sqrt_ps(va);
_mm_storel_pi((__m64*)result, vr);
}
inline void simd_rsqrt_float32_64(float *a, float *result) {
__m128 va = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)a);
__m128 vr = _mm_rsqrt_ps(va);
_mm_storel_pi((__m64*)result, vr);
}
inline void simd_sqrt_float32_128(float *a, float *result) {
__m128 va = _mm_loadu_ps(a);
__m128 vr = _mm_sqrt_ps(va);
_mm_storeu_ps(result, vr);
}
inline void simd_rsqrt_float32_128(float *a, float *result) {
__m128 va = _mm_loadu_ps(a);
__m128 vr = _mm_rsqrt_ps(va);
_mm_storeu_ps(result, vr);
}
inline void simd_sqrt_float32_128_aligned(float *a, float *result) {
__m128 va = _mm_load_ps(a);
__m128 vr = _mm_sqrt_ps(va);
_mm_store_ps(result, vr);
}
inline void simd_rsqrt_float32_128_aligned(float *a, float *result) {
__m128 va = _mm_load_ps(a);
__m128 vr = _mm_rsqrt_ps(va);
_mm_store_ps(result, vr);
}
#if SIMD_ENABLE_SSE2
@ -191,14 +247,6 @@ inline float simd_dot_product_float32_96(float *a, float *b) {
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
return _mm_cvtss_f32(dot_product);
}
inline float simd_dot_product_float32_96_aligned(float *a, float *b) {
__m128 vec1 = _mm_load_ps(a);
__m128 vec2 = _mm_load_ps(b);
vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
return _mm_cvtss_f32(dot_product);
}
inline float simd_dot_product_float32_128(float *a, float *b) {
__m128 vec1 = _mm_loadu_ps(a);
__m128 vec2 = _mm_loadu_ps(b);
@ -217,8 +265,6 @@ inline float simd_dot_product_float32_128_aligned(float *a, float *b) {
#define simd_dot_product_float32_64 basic_dot_product_float32_64
#define simd_dot_product_float32_96 basic_dot_product_float32_96
#define simd_dot_product_float32_128 basic_dot_product_float32_128
#define simd_dot_product_float32_64_aligned basic_dot_product_float32_64
#define simd_dot_product_float32_96_aligned basic_dot_product_float32_96
#define simd_dot_product_float32_128_aligned basic_dot_product_float32_128
#endif // SIMD_ENABLE_SSE41
@ -275,16 +321,41 @@ inline void simd_div_float32_256_aligned(float32 *a, float32 *b, float32* result
__m256 vr = _mm256_div_ps(va, vb);
_mm256_store_ps(result, vr);
}
inline void simd_sqrt_float32_256(float *a, float *result) {
__m256 va = _mm256_loadu_ps(a);
__m256 vr = _mm256_sqrt_ps(va);
_mm256_storeu_ps(result, vr);
}
inline void simd_rsqrt_float32_256(float *a, float *result) {
__m256 va = _mm256_loadu_ps(a);
__m256 vr = _mm256_rsqrt_ps(va);
_mm256_storeu_ps(result, vr);
}
inline void simd_sqrt_float32_256_aligned(float *a, float *result) {
__m256 va = _mm256_load_ps(a);
__m256 vr = _mm256_sqrt_ps(va);
_mm256_store_ps(result, vr);
}
inline void simd_rsqrt_float32_256_aligned(float *a, float *result) {
__m256 va = _mm256_load_ps(a);
__m256 vr = _mm256_rsqrt_ps(va);
_mm256_store_ps(result, vr);
}
#else
#define simd_add_float32_256 basic_add_float32_256
#define simd_sub_float32_256 basic_sub_float32_256
#define simd_mul_float32_256 basic_mul_float32_256
#define simd_div_float32_256 basic_div_float32_256
#define simd_sqrt_float32_256 basic_sqrt_float32_256
#define simd_rsqrt_float32_256 basic_rsqrt_float32_256
#define simd_add_float32_256_aligned basic_add_float32_256
#define simd_sub_float32_256_aligned basic_sub_float32_256
#define simd_mul_float32_256_aligned basic_mul_float32_256
#define simd_div_float32_256_aligned basic_div_float32_256
#define simd_sqrt_float32_256_aligned basic_sqrt_float32_256
#define simd_rsqrt_float32_256_aligned basic_rsqrt_float32_256
#endif
#if SIMD_ENABLE_AVX2
@ -332,7 +403,6 @@ inline void simd_mul_int32_256_aligned(s32 *a, s32 *b, s32* result) {
#define simd_add_int32_256 basic_add_int32_256
#define simd_sub_int32_256 basic_sub_int32_256
#define simd_mul_int32_256 basic_mul_int32_256
#define simd_add_int32_256_aligned basic_add_int32_256
#define simd_sub_int32_256_aligned basic_sub_int32_256
#define simd_mul_int32_256_aligned basic_mul_int32_256
@ -432,6 +502,28 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
__m512i vr = _mm512_mullo_epi32(va, vb);
_mm512_store_si512((__m512i*)result, vr);
}
inline void simd_sqrt_float32_512(float *a, float *result) {
__m512 va = _mm512_loadu_ps(a);
__m512 vr = _mm512_sqrt_ps(va);
_mm512_storeu_ps(result, vr);
}
inline void simd_rsqrt_float32_512(float *a, float *result) {
__m512 va = _mm512_loadu_ps(a);
__m512 vr = _mm512_rsqrt14_ps(va); // AVX-512 does not have _mm512_rsqrt_ps
_mm512_storeu_ps(result, vr);
}
inline void simd_sqrt_float32_512_aligned(float *a, float *result) {
__m512 va = _mm512_load_ps(a);
__m512 vr = _mm512_sqrt_ps(va);
_mm512_store_ps(result, vr);
}
inline void simd_rsqrt_float32_512_aligned(float *a, float *result) {
__m512 va = _mm512_load_ps(a);
__m512 vr = _mm512_rsqrt14_ps(va);
_mm512_store_ps(result, vr);
}
#else
#define simd_add_float32_512 basic_add_float32_512
#define simd_sub_float32_512 basic_sub_float32_512
@ -440,7 +532,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512 basic_add_int32_512
#define simd_sub_int32_512 basic_sub_int32_512
#define simd_mul_int32_512 basic_mul_int32_512
#define simd_sqrt_float32_512 basic_sqrt_float32_512
#define simd_rsqrt_float32_512 basic_rsqrt_float32_512
#define simd_add_float32_512_aligned basic_add_float32_512
#define simd_sub_float32_512_aligned basic_sub_float32_512
#define simd_mul_float32_512_aligned basic_mul_float32_512
@ -448,6 +541,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512_aligned basic_add_int32_512
#define simd_sub_int32_512_aligned basic_sub_int32_512
#define simd_mul_int32_512_aligned basic_mul_int32_512
#define simd_sqrt_float32_512_aligned basic_sqrt_float32_512
#define simd_rsqrt_float32_512_aligned basic_rsqrt_float32_512
#endif // SIMD_ENABLE_AVX512
#else
@ -461,10 +556,16 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_mul_float32_128 basic_mul_float32_128
#define simd_div_float32_64 basic_div_float32_64
#define simd_div_float32_128 basic_div_float32_128
#define simd_sqrt_float32_64 basic_sqrt_float32_64
#define simd_sqrt_float32_128 basic_sqrt_float32_128
#define simd_rsqrt_float32_64 basic_rsqrt_float32_64
#define simd_rsqrt_float32_128 basic_rsqrt_float32_128
#define simd_add_float32_128_aligned basic_add_float32_128
#define simd_sub_float32_128_aligned basic_sub_float32_128
#define simd_mul_float32_128_aligned basic_mul_float32_128
#define simd_div_float32_128_aligned basic_div_float32_128
#define simd_sqrt_float32_128_aligned basic_sqrt_float32_128
#define simd_rsqrt_float32_128_aligned basic_rsqrt_float32_128
// SSE2
#define simd_add_int32_128 basic_add_int32_128
@ -475,19 +576,26 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_mul_int32_128_aligned basic_mul_int32_128
// SSE41
#define simd_mul_int32_128 basic_mul_int32_128
#define simd_mul_int32_128_aligned basic_mul_int32_128
#define simd_dot_product_float32_64 basic_dot_product_float32_64
#define simd_dot_product_float32_96 basic_dot_product_float32_96
#define simd_dot_product_float32_128 basic_dot_product_float32_128
#define simd_dot_product_float32_128_aligned basic_dot_product_float32_128
// AVX
#define simd_add_float32_256 basic_add_float32_256
#define simd_sub_float32_256 basic_sub_float32_256
#define simd_mul_float32_256 basic_mul_float32_256
#define simd_div_float32_256 basic_div_float32_256
#define simd_sqrt_float32_256 basic_sqrt_float32_256
#define simd_rsqrt_float32_256 basic_rsqrt_float32_256
#define simd_add_float32_256_aligned basic_add_float32_256
#define simd_sub_float32_256_aligned basic_sub_float32_256
#define simd_mul_float32_256_aligned basic_mul_float32_256
#define simd_div_float32_256_aligned basic_div_float32_256
#define simd_sqrt_float32_256_aligned basic_sqrt_float32_256
#define simd_rsqrt_float32_256_aligned basic_rsqrt_float32_256
// AVX2
#define simd_add_int32_256 basic_add_int32_256
@ -505,6 +613,8 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512 basic_add_int32_512
#define simd_sub_int32_512 basic_sub_int32_512
#define simd_mul_int32_512 basic_mul_int32_512
#define simd_sqrt_float32_512 basic_sqrt_float32_512
#define simd_rsqrt_float32_512 basic_rsqrt_float32_512
#define simd_add_float32_512_aligned basic_add_float32_512
#define simd_sub_float32_512_aligned basic_sub_float32_512
#define simd_mul_float32_512_aligned basic_mul_float32_512
@ -512,9 +622,14 @@ inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512_aligned basic_add_int32_512
#define simd_sub_int32_512_aligned basic_sub_int32_512
#define simd_mul_int32_512_aligned basic_mul_int32_512
#define simd_sqrt_float32_512_aligned basic_sqrt_float32_512
#define simd_rsqrt_float32_512_aligned basic_rsqrt_float32_512
#endif
// Hand-written prototypes for the scalar routines the basic_sqrt_* and
// basic_rsqrt_* fallbacks call (presumably so <math.h> is not needed).
// NOTE(review): `rsqrt` is not a standard C library function — confirm that a
// definition is actually linked in, otherwise any use of the basic_rsqrt_*
// fallbacks will fail at link time.
double __cdecl sqrt(_In_ double _X);
double __cdecl rsqrt(_In_ double _X);
inline void basic_add_float32_64 (float32 *a, float32 *b, float32* result) {
result[0] = a[0] + b[0];
result[1] = a[1] + b[1];
@ -638,6 +753,55 @@ inline float basic_dot_product_float32_96(float *a, float *b) {
inline float basic_dot_product_float32_128(float *a, float *b) {
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}
inline void basic_sqrt_float32_64(float *a, float *result) {
result[0] = sqrt(a[0]);
result[1] = sqrt(a[1]);
}
inline void basic_sqrt_float32_96(float *a, float *result) {
result[0] = sqrt(a[0]);
result[1] = sqrt(a[1]);
result[2] = sqrt(a[2]);
}
inline void basic_sqrt_float32_128(float *a, float *result) {
result[0] = sqrt(a[0]);
result[1] = sqrt(a[1]);
result[2] = sqrt(a[2]);
result[3] = sqrt(a[3]);
}
// Scalar fallback for eight floats: processes the low four, then the high
// four, with basic_sqrt_float32_128.
inline void basic_sqrt_float32_256(float *a, float *result) {
    for (int half = 0; half < 2; half++) {
        basic_sqrt_float32_128(a + half * 4, result + half * 4);
    }
}
// Scalar fallback for sixteen floats: processes the low eight, then the high
// eight, with basic_sqrt_float32_256.
inline void basic_sqrt_float32_512(float *a, float *result) {
    for (int half = 0; half < 2; half++) {
        basic_sqrt_float32_256(a + half * 8, result + half * 8);
    }
}
inline void basic_rsqrt_float32_64(float *a, float *result) {
result[0] = rsqrt(a[0]);
result[1] = rsqrt(a[1]);
}
inline void basic_rsqrt_float32_96(float *a, float *result) {
result[0] = rsqrt(a[0]);
result[1] = rsqrt(a[1]);
result[2] = rsqrt(a[2]);
}
inline void basic_rsqrt_float32_128(float *a, float *result) {
result[0] = rsqrt(a[0]);
result[1] = rsqrt(a[1]);
result[2] = rsqrt(a[2]);
result[3] = rsqrt(a[3]);
}
// Scalar fallback for eight floats: processes the low four, then the high
// four, with basic_rsqrt_float32_128.
inline void basic_rsqrt_float32_256(float *a, float *result) {
    for (int half = 0; half < 2; half++) {
        basic_rsqrt_float32_128(a + half * 4, result + half * 4);
    }
}
// Scalar fallback for sixteen floats: processes the low eight, then the high
// eight, with basic_rsqrt_float32_256.
inline void basic_rsqrt_float32_512(float *a, float *result) {
    for (int half = 0; half < 2; half++) {
        basic_rsqrt_float32_256(a + half * 8, result + half * 8);
    }
}

View file

@ -5,7 +5,6 @@
*/
void * memcpy (void *,const void *,size_t);
void* talloc(u64);
typedef struct string {
@ -13,6 +12,7 @@ typedef struct string {
u8 *data;
} string;
#define fixed_string STR
#define STR(s) ((string){ length_of_null_terminated_string((const char*)s), (u8*)s })
inline u64 length_of_null_terminated_string(const char* cstring) {

View file

@ -213,7 +213,7 @@ void printf(const char* fmt, ...) {
typedef void(*Logger_Proc)(Log_Level level, string s);
#define LOG_BASE(level, ...) If context.logger then ((Logger_Proc)context.logger)(level, tprint(__VA_ARGS__))
// Formats the arguments with tprint and passes them to context.logger at the
// given level; does nothing when no logger is installed.
// Written as a single void expression rather than a bare `if` so the macro
// cannot capture a following `else` and behaves like a normal call statement.
#define LOG_BASE(level, ...) ((context.logger) ? ((Logger_Proc)context.logger)(level, tprint(__VA_ARGS__)) : (void)0)
#define log_verbose(...) LOG_BASE(LOG_VERBOSE, __VA_ARGS__)

View file

@ -1,6 +1,6 @@
// Custom allocators for lodepng
Allocator get_heap_allocator();
Allocator lodepng_allocator = {0};
/*Allocator lodepng_allocator = {0};
void* lodepng_malloc(size_t size) {
#ifdef LODEPNG_MAX_ALLOC
if(size > LODEPNG_MAX_ALLOC) return 0;
@ -28,4 +28,47 @@ void lodepng_free(void* ptr) {
#define LODEPNG_NO_COMPILE_ENCODER
// One day I might write my own png decoder so we don't even need this
#include "third_party/lodepng.h"
#include "third_party/lodepng.c"
#include "third_party/lodepng.c"*/
#define STB_TRUETYPE_IMPLEMENTATION
#define STB_IMAGE_IMPLEMENTATION
typedef unsigned char u8;
typedef signed char s8;
typedef unsigned short u16;
typedef signed short s16;
typedef unsigned int u32;
typedef signed int s32;
// Allocation hook for the stb libraries: returns 0 for a zero-byte request,
// otherwise allocates `size` bytes from the heap allocator.
void *stbtt_malloc(size_t size) {
    if (size == 0) {
        return 0;
    }
    return alloc(get_heap_allocator(), size);
}
#define STBTT_malloc(x,u) ((void)(u),stbtt_malloc(x))
// Deallocation hook for the stb libraries: releases `p` through the heap
// allocator; a null pointer is ignored.
void stbtt_free(void *p) {
    if (p) {
        dealloc(get_heap_allocator(), p);
    }
}
#define STBTT_free(x,u) ((void)(u),stbtt_free(x))
#define STBTT_assert(x) assert(x)
// strlen replacement for stb_truetype: returns the number of chars before
// the terminating NUL in `str`.
size_t stbtt_strlen(const char* str) {
    const char *end = str;
    while (*end != 0) {
        end++;
    }
    return (size_t)(end - str);
}
#define STBTT_strlen(x) stbtt_strlen(x)
#define STBTT_memcpy memcpy
#define STBTT_memset memset
#define STBI_NO_STDIO
// Assertion hook for stb_image: writes through a null pointer to crash when
// `x` is false. Written as a void expression (not a `{...}` block) so that
// `STBI_ASSERT(x);` is an ordinary statement and stays valid in front of an
// `else` or in any other position where a call expression would be.
#define STBI_ASSERT(x) ((x) ? (void)0 : (void)(*(volatile char*)0 = 0))
#define STBI_MALLOC(sz) stbtt_malloc(sz)
#define STBI_REALLOC(p,newsz) get_heap_allocator().proc(newsz, p, ALLOCATOR_REALLOCATE, 0)
#define STBI_FREE(p) stbtt_free(p)
#include "third_party/stb_image.h"
#include "third_party/stb_truetype.h"

7988
oogabooga/third_party/stb_image.h vendored Normal file

File diff suppressed because it is too large Load diff

5080
oogabooga/third_party/stb_truetype.h vendored Normal file

File diff suppressed because it is too large Load diff