diff --git a/build.bat b/build.bat index 0636236..e9a44ce 100644 --- a/build.bat +++ b/build.bat @@ -6,6 +6,6 @@ mkdir build pushd build -clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler +clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -msse4.1 popd \ No newline at end of file diff --git a/build.c b/build.c index 4851453..67eea87 100644 --- a/build.c +++ b/build.c @@ -3,14 +3,14 @@ /// // Build config stuff -#define RUN_TESTS 0 +#define RUN_TESTS 1 // This is only for people developing oogabooga! #define OOGABOOGA_DEV 1 #define ENABLE_PROFILING 0 -// Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which don't. +// ENABLE_SIMD Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which doesn't #define ENABLE_SIMD 1 #define INITIAL_PROGRAM_MEMORY_SIZE MB(5) diff --git a/build_release.bat b/build_release.bat index b627f50..dc42650 100644 --- a/build_release.bat +++ b/build_release.bat @@ -7,7 +7,7 @@ pushd build mkdir release pushd release -clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -mavx2 -mavx512f -msse4.1 -msse2 -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions +clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -msse4.1 popd popd \ No newline at end of file diff --git a/oogabooga/cpu.c b/oogabooga/cpu.c index bf3ca5d..6cacb82 100644 --- a/oogabooga/cpu.c +++ b/oogabooga/cpu.c @@ -27,6 +27,7 @@ typedef struct Cpu_Capabilities { // Compiler specific stuff #if COMPILER_MVSC #define inline __forceinline + #define alignat(x) __declspec(align(x)) #define COMPILER_HAS_MEMCPY_INTRINSICS 1 #include #pragma intrinsic(__rdtsc) @@ -63,6 +64,7 @@ typedef struct Cpu_Capabilities { #endif #elif COMPILER_GCC || COMPILER_CLANG #define inline __attribute__((always_inline)) inline + #define alignat(x) __attribute__((aligned(x))) #define COMPILER_HAS_MEMCPY_INTRINSICS 1 inline u64 rdtsc() { unsigned int lo, hi; diff --git a/oogabooga/linmath.c b/oogabooga/linmath.c index 3ad91c8..3603d59 100644 --- a/oogabooga/linmath.c +++ b/oogabooga/linmath.c @@ -13,13 +13,13 @@ #define to_radians32 to_radians #define to_degrees32 to_degrees -typedef union Vector2 { +typedef alignat(16) union Vector2 { struct {float32 x, y;}; } Vector2; inline Vector2 v2(float32 x, float32 y) { return (Vector2){x, y}; } #define v2_expand(v) (v).x, (v).y -typedef union Vector3 { +typedef alignat(16) union Vector3 { struct {float32 x, y, z;}; struct {float32 r, g, b;}; struct {Vector2 xy;}; @@ -28,7 +28,7 @@ typedef union Vector3 { inline Vector3 v3(float32 x, float32 y, float32 z) { return (Vector3){x, y, z}; } #define v3_expand(v) (v).x, (v).y, (v).z -typedef union Vector4 { +typedef alignat(16) union Vector4 { struct {float32 x, y, z, w;}; struct {float32 x1, y1, x2, y2;}; struct {float32 r, g, b, a;}; @@ -66,19 +66,19 @@ inline Vector2 v2_divf(Vector2 a, float32 s) { inline Vector3 v3_add(Vector3 a, Vector3 b) { Vector4 a128 = v4(a.x, a.y, a.z, 0.0); Vector4 b128 = v4(b.x, b.y, b.z, 0.0); - simd_add_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); + simd_add_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128); return a128.xyz; } inline Vector3 v3_sub(Vector3 a, Vector3 b) { Vector4 a128 = v4(a.x, a.y, a.z, 0.0); Vector4 b128 = v4(b.x, b.y, b.z, 0.0); - simd_sub_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); + simd_sub_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128); return a128.xyz; } inline Vector3 v3_mul(Vector3 a, Vector3 b) { Vector4 a128 = v4(a.x, a.y, a.z, 0.0); Vector4 b128 = v4(b.x, b.y, b.z, 0.0); - simd_mul_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); + simd_mul_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128); return a128.xyz; } inline Vector3 v3_mulf(Vector3 a, float32 s) { @@ -87,7 +87,7 @@ inline Vector3 v3_mulf(Vector3 a, float32 s) { inline Vector3 v3_div(Vector3 a, Vector3 b) { Vector4 a128 = v4(a.x, a.y, a.z, 0.0); Vector4 b128 = v4(b.x, b.y, b.z, 0.0); - simd_div_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); + simd_div_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128); return a128.xyz; } inline Vector3 v3_divf(Vector3 a, float32 s) { @@ -95,28 +95,29 @@ inline Vector3 v3_divf(Vector3 a, float32 s) { } inline Vector4 v4_add(Vector4 a, Vector4 b) { - simd_add_float32_128((f32*)&a, (f32*)&b, (f32*)&a); + simd_add_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a); return a; } inline Vector4 v4_sub(Vector4 a, Vector4 b) { - simd_sub_float32_128((f32*)&a, (f32*)&b, (f32*)&a); + simd_sub_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a); return a; } inline Vector4 v4_mul(Vector4 a, Vector4 b) { - simd_mul_float32_128((f32*)&a, (f32*)&b, (f32*)&a); + simd_mul_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a); return a; } inline Vector4 v4_mulf(Vector4 a, float32 s) { return v4_mul(a, v4(s, s, s, s)); } inline Vector4 v4_div(Vector4 a, Vector4 b) { - simd_div_float32_128((f32*)&a, (f32*)&b, (f32*)&a); + simd_div_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a); return a; } inline Vector4 v4_divf(Vector4 a, float32 s) { return v4_div(a, v4(s, s, s, s)); } + inline Vector2 v2_normalize(Vector2 a) { float32 length = sqrt(a.x * a.x + a.y * a.y); if (length == 0) { @@ -125,6 +126,15 @@ inline Vector2 v2_normalize(Vector2 a) { return v2_divf(a, length); } +inline float v2_dot_product(Vector2 a, Vector2 b) { + return simd_dot_product_float32_64((float*)&a, (float*)&b); +} +inline float v3_dot_product(Vector3 a, Vector3 b) { + return simd_dot_product_float32_96((float*)&a, (float*)&b); +} +inline float v4_dot_product(Vector4 a, Vector4 b) { + return simd_dot_product_float32_128((float*)&a, (float*)&b); +} Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) { float32 s = sin(rotation_radians); diff --git a/oogabooga/simd.c b/oogabooga/simd.c index 3beaec9..c73761b 100644 --- a/oogabooga/simd.c +++ b/oogabooga/simd.c @@ -27,6 +27,10 @@ inline void basic_mul_int32_128(s32 *a, s32 *b, s32* result); inline void basic_mul_int32_256(s32 *a, s32 *b, s32* result); inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result); +inline float basic_dot_product_float32_64(float *a, float *b); +inline float basic_dot_product_float32_96(float *a, float *b); +inline float basic_dot_product_float32_128(float *a, float *b); + #if ENABLE_SIMD @@ -74,6 +78,12 @@ inline void simd_add_float32_128(float *a, float *b, float* result) { __m128 vr = _mm_add_ps(va, vb); _mm_storeu_ps(result, vr); } +inline void simd_add_float32_128_aligned(float *a, float *b, float* result) { + __m128 va = _mm_load_ps(a); + __m128 vb = _mm_load_ps(b); + __m128 vr = _mm_add_ps(va, vb); + _mm_store_ps(result, vr); +} inline void simd_sub_float32_128(float *a, float *b, float* result) { __m128 va = _mm_loadu_ps(a); @@ -81,6 +91,12 @@ inline void simd_sub_float32_128(float *a, float *b, float* result) { __m128 vr = _mm_sub_ps(va, vb); _mm_storeu_ps(result, vr); } +inline void simd_sub_float32_128_aligned(float *a, float *b, float* result) { + __m128 va = _mm_load_ps(a); + __m128 vb = _mm_load_ps(b); + __m128 vr = _mm_sub_ps(va, vb); + _mm_store_ps(result, vr); +} inline void simd_mul_float32_128(float *a, float *b, float* result) { __m128 va = _mm_loadu_ps(a); @@ -88,6 +104,12 @@ inline void simd_mul_float32_128(float *a, float *b, float* result) { __m128 vr = _mm_mul_ps(va, vb); _mm_storeu_ps(result, vr); } +inline void simd_mul_float32_128_aligned(float *a, float *b, float* result) { + __m128 va = _mm_load_ps(a); + __m128 vb = _mm_load_ps(b); + __m128 vr = _mm_mul_ps(va, vb); + _mm_store_ps(result, vr); +} inline void simd_div_float32_128(float *a, float *b, float* result) { __m128 va = _mm_loadu_ps(a); @@ -95,6 +117,13 @@ inline void simd_div_float32_128(float *a, float *b, float* result) { __m128 vr = _mm_div_ps(va, vb); _mm_storeu_ps(result, vr); } +inline void simd_div_float32_128_aligned(float *a, float *b, float* result) { + __m128 va = _mm_load_ps(a); + __m128 vb = _mm_load_ps(b); + __m128 vr = _mm_div_ps(va, vb); + _mm_store_ps(result, vr); +} + #if SIMD_ENABLE_SSE2 // SSE2 @@ -107,17 +136,32 @@ inline void simd_add_int32_128(s32 *a, s32 *b, s32* result) { __m128i vr = _mm_add_epi32(va, vb); _mm_storeu_si128((__m128i*)result, vr); } +inline void simd_add_int32_128_aligned(s32 *a, s32 *b, s32* result) { + __m128i va = _mm_load_si128((__m128i*)a); + __m128i vb = _mm_load_si128((__m128i*)b); + __m128i vr = _mm_add_epi32(va, vb); + _mm_store_si128((__m128i*)result, vr); +} inline void simd_sub_int32_128(s32 *a, s32 *b, s32* result) { __m128i va = _mm_loadu_si128((__m128i*)a); __m128i vb = _mm_loadu_si128((__m128i*)b); __m128i vr = _mm_sub_epi32(va, vb); _mm_storeu_si128((__m128i*)result, vr); } +inline void simd_sub_int32_128_aligned(s32 *a, s32 *b, s32* result) { + __m128i va = _mm_load_si128((__m128i*)a); + __m128i vb = _mm_load_si128((__m128i*)b); + __m128i vr = _mm_sub_epi32(va, vb); + _mm_store_si128((__m128i*)result, vr); +} #else #define simd_add_int32_128 basic_add_int32_128 #define simd_sub_int32_128 basic_sub_int32_128 + #define simd_add_int32_128_aligned basic_add_int32_128 + #define simd_sub_int32_128_aligned basic_sub_int32_128 + #endif #if SIMD_ENABLE_SSE41 @@ -127,8 +171,55 @@ inline void simd_mul_int32_128(s32 *a, s32 *b, s32* result) { __m128i vr = _mm_mullo_epi32(va, vb); _mm_storeu_si128((__m128i*)result, vr); } +inline void simd_mul_int32_128_aligned(s32 *a, s32 *b, s32* result) { + __m128i va = _mm_load_si128((__m128i*)a); + __m128i vb = _mm_load_si128((__m128i*)b); + __m128i vr = _mm_mullo_epi32(va, vb); + _mm_store_si128((__m128i*)result, vr); +} +inline float simd_dot_product_float32_64(float *a, float *b) { + __m128 vec1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)a); + __m128 vec2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)b); + __m128 dot_product = _mm_dp_ps(vec1, vec2, 0x31); + return _mm_cvtss_f32(dot_product); +} +inline float simd_dot_product_float32_96(float *a, float *b) { + __m128 vec1 = _mm_loadu_ps(a); + __m128 vec2 = _mm_loadu_ps(b); + vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); + vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); + __m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71); + return _mm_cvtss_f32(dot_product); +} +inline float simd_dot_product_float32_96_aligned(float *a, float *b) { + __m128 vec1 = _mm_load_ps(a); + __m128 vec2 = _mm_load_ps(b); + vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); + vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); + __m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71); + return _mm_cvtss_f32(dot_product); +} +inline float simd_dot_product_float32_128(float *a, float *b) { + __m128 vec1 = _mm_loadu_ps(a); + __m128 vec2 = _mm_loadu_ps(b); + __m128 dot_product = _mm_dp_ps(vec1, vec2, 0xF1); + return _mm_cvtss_f32(dot_product); +} +inline float simd_dot_product_float32_128_aligned(float *a, float *b) { + __m128 vec1 = _mm_load_ps(a); + __m128 vec2 = _mm_load_ps(b); + __m128 dot_product = _mm_dp_ps(vec1, vec2, 0xF1); + return _mm_cvtss_f32(dot_product); +} #else #define simd_mul_int32_128 basic_mul_int32_128 + #define simd_mul_int32_128_aligned basic_mul_int32_128 + #define simd_dot_product_float32_64 basic_dot_product_float32_64 + #define simd_dot_product_float32_96 basic_dot_product_float32_96 + #define simd_dot_product_float32_128 basic_dot_product_float32_128 + #define simd_dot_product_float32_64_aligned basic_dot_product_float32_64 + #define simd_dot_product_float32_96_aligned basic_dot_product_float32_96 + #define simd_dot_product_float32_128_aligned basic_dot_product_float32_128 #endif // SIMD_ENABLE_SSE41 #if SIMD_ENABLE_AVX @@ -142,29 +233,58 @@ inline void simd_add_float32_256(float32 *a, float32 *b, float32* result) { __m256 vr = _mm256_add_ps(va, vb); _mm256_storeu_ps(result, vr); } +inline void simd_add_float32_256_aligned(float32 *a, float32 *b, float32* result) { + __m256 va = _mm256_load_ps(a); + __m256 vb = _mm256_load_ps(b); + __m256 vr = _mm256_add_ps(va, vb); + _mm256_store_ps(result, vr); +} inline void simd_sub_float32_256(float32 *a, float32 *b, float32* result) { __m256 va = _mm256_loadu_ps(a); __m256 vb = _mm256_loadu_ps(b); __m256 vr = _mm256_sub_ps(va, vb); _mm256_storeu_ps(result, vr); } +inline void simd_sub_float32_256_aligned(float32 *a, float32 *b, float32* result) { + __m256 va = _mm256_load_ps(a); + __m256 vb = _mm256_load_ps(b); + __m256 vr = _mm256_sub_ps(va, vb); + _mm256_store_ps(result, vr); +} inline void simd_mul_float32_256(float32 *a, float32 *b, float32* result) { __m256 va = _mm256_loadu_ps(a); __m256 vb = _mm256_loadu_ps(b); __m256 vr = _mm256_mul_ps(va, vb); _mm256_storeu_ps(result, vr); } +inline void simd_mul_float32_256_aligned(float32 *a, float32 *b, float32* result) { + __m256 va = _mm256_load_ps(a); + __m256 vb = _mm256_load_ps(b); + __m256 vr = _mm256_mul_ps(va, vb); + _mm256_store_ps(result, vr); +} inline void simd_div_float32_256(float32 *a, float32 *b, float32* result){ __m256 va = _mm256_loadu_ps(a); __m256 vb = _mm256_loadu_ps(b); __m256 vr = _mm256_div_ps(va, vb); _mm256_storeu_ps(result, vr); } +inline void simd_div_float32_256_aligned(float32 *a, float32 *b, float32* result){ + __m256 va = _mm256_load_ps(a); + __m256 vb = _mm256_load_ps(b); + __m256 vr = _mm256_div_ps(va, vb); + _mm256_store_ps(result, vr); +} #else #define simd_add_float32_256 basic_add_float32_256 #define simd_sub_float32_256 basic_sub_float32_256 #define simd_mul_float32_256 basic_mul_float32_256 #define simd_div_float32_256 basic_div_float32_256 + + #define simd_add_float32_256_aligned basic_add_float32_256 + #define simd_sub_float32_256_aligned basic_sub_float32_256 + #define simd_mul_float32_256_aligned basic_mul_float32_256 + #define simd_div_float32_256_aligned basic_div_float32_256 #endif #if SIMD_ENABLE_AVX2 @@ -178,22 +298,44 @@ inline void simd_add_int32_256(s32 *a, s32 *b, s32* result) { __m256i vr = _mm256_add_epi32(va, vb); _mm256_storeu_si256((__m256i*)result, vr); } +inline void simd_add_int32_256_aligned(s32 *a, s32 *b, s32* result) { + __m256i va = _mm256_load_si256((__m256i*)a); + __m256i vb = _mm256_load_si256((__m256i*)b); + __m256i vr = _mm256_add_epi32(va, vb); + _mm256_store_si256((__m256i*)result, vr); +} inline void simd_sub_int32_256(s32 *a, s32 *b, s32* result) { __m256i va = _mm256_loadu_si256((__m256i*)a); __m256i vb = _mm256_loadu_si256((__m256i*)b); __m256i vr = _mm256_sub_epi32(va, vb); _mm256_storeu_si256((__m256i*)result, vr); } +inline void simd_sub_int32_256_aligned(s32 *a, s32 *b, s32* result) { + __m256i va = _mm256_load_si256((__m256i*)a); + __m256i vb = _mm256_load_si256((__m256i*)b); + __m256i vr = _mm256_sub_epi32(va, vb); + _mm256_store_si256((__m256i*)result, vr); +} inline void simd_mul_int32_256(s32 *a, s32 *b, s32* result) { __m256i va = _mm256_loadu_si256((__m256i*)a); __m256i vb = _mm256_loadu_si256((__m256i*)b); __m256i vr = _mm256_mullo_epi32(va, vb); _mm256_storeu_si256((__m256i*)result, vr); } +inline void simd_mul_int32_256_aligned(s32 *a, s32 *b, s32* result) { + __m256i va = _mm256_load_si256((__m256i*)a); + __m256i vb = _mm256_load_si256((__m256i*)b); + __m256i vr = _mm256_mullo_epi32(va, vb); + _mm256_store_si256((__m256i*)result, vr); +} #else #define simd_add_int32_256 basic_add_int32_256 #define simd_sub_int32_256 basic_sub_int32_256 #define simd_mul_int32_256 basic_mul_int32_256 + + #define simd_add_int32_256_aligned basic_add_int32_256 + #define simd_sub_int32_256_aligned basic_sub_int32_256 + #define simd_mul_int32_256_aligned basic_mul_int32_256 #endif #if SIMD_ENABLE_AVX512 @@ -207,6 +349,12 @@ inline void simd_add_float32_512(float *a, float *b, float* result) { __m512 vr = _mm512_add_ps(va, vb); _mm512_storeu_ps(result, vr); } +inline void simd_add_float32_512_aligned(float *a, float *b, float* result) { + __m512 va = _mm512_load_ps(a); + __m512 vb = _mm512_load_ps(b); + __m512 vr = _mm512_add_ps(va, vb); + _mm512_store_ps(result, vr); +} inline void simd_sub_float32_512(float *a, float *b, float* result) { __m512 va = _mm512_loadu_ps(a); @@ -214,6 +362,12 @@ inline void simd_sub_float32_512(float *a, float *b, float* result) { __m512 vr = _mm512_sub_ps(va, vb); _mm512_storeu_ps(result, vr); } +inline void simd_sub_float32_512_aligned(float *a, float *b, float* result) { + __m512 va = _mm512_load_ps(a); + __m512 vb = _mm512_load_ps(b); + __m512 vr = _mm512_sub_ps(va, vb); + _mm512_store_ps(result, vr); +} inline void simd_mul_float32_512(float *a, float *b, float* result) { __m512 va = _mm512_loadu_ps(a); @@ -221,6 +375,12 @@ inline void simd_mul_float32_512(float *a, float *b, float* result) { __m512 vr = _mm512_mul_ps(va, vb); _mm512_storeu_ps(result, vr); } +inline void simd_mul_float32_512_aligned(float *a, float *b, float* result) { + __m512 va = _mm512_load_ps(a); + __m512 vb = _mm512_load_ps(b); + __m512 vr = _mm512_mul_ps(va, vb); + _mm512_store_ps(result, vr); +} inline void simd_div_float32_512(float *a, float *b, float* result) { __m512 va = _mm512_loadu_ps(a); @@ -228,12 +388,24 @@ inline void simd_div_float32_512(float *a, float *b, float* result) { __m512 vr = _mm512_div_ps(va, vb); _mm512_storeu_ps(result, vr); } +inline void simd_div_float32_512_aligned(float *a, float *b, float* result) { + __m512 va = _mm512_load_ps(a); + __m512 vb = _mm512_load_ps(b); + __m512 vr = _mm512_div_ps(va, vb); + _mm512_store_ps(result, vr); +} inline void simd_add_int32_512(int32 *a, int32 *b, int32* result) { __m512i va = _mm512_loadu_si512((__m512i*)a); __m512i vb = _mm512_loadu_si512((__m512i*)b); __m512i vr = _mm512_add_epi32(va, vb); _mm512_storeu_si512((__m512i*)result, vr); } +inline void simd_add_int32_512_aligned(int32 *a, int32 *b, int32* result) { + __m512i va = _mm512_load_si512((__m512i*)a); + __m512i vb = _mm512_load_si512((__m512i*)b); + __m512i vr = _mm512_add_epi32(va, vb); + _mm512_store_si512((__m512i*)result, vr); +} inline void simd_sub_int32_512(int32 *a, int32 *b, int32* result) { __m512i va = _mm512_loadu_si512((__m512i*)a); @@ -241,6 +413,12 @@ inline void simd_sub_int32_512(int32 *a, int32 *b, int32* result) { __m512i vr = _mm512_sub_epi32(va, vb); _mm512_storeu_si512((__m512i*)result, vr); } +inline void simd_sub_int32_512_aligned(int32 *a, int32 *b, int32* result) { + __m512i va = _mm512_load_si512((__m512i*)a); + __m512i vb = _mm512_load_si512((__m512i*)b); + __m512i vr = _mm512_sub_epi32(va, vb); + _mm512_store_si512((__m512i*)result, vr); +} inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) { __m512i va = _mm512_loadu_si512((__m512i*)a); @@ -248,6 +426,12 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) { __m512i vr = _mm512_mullo_epi32(va, vb); _mm512_storeu_si512((__m512i*)result, vr); } +inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) { + __m512i va = _mm512_load_si512((__m512i*)a); + __m512i vb = _mm512_load_si512((__m512i*)b); + __m512i vr = _mm512_mullo_epi32(va, vb); + _mm512_store_si512((__m512i*)result, vr); +} #else #define simd_add_float32_512 basic_add_float32_512 #define simd_sub_float32_512 basic_sub_float32_512 @@ -256,6 +440,14 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) { #define simd_add_int32_512 basic_add_int32_512 #define simd_sub_int32_512 basic_sub_int32_512 #define simd_mul_int32_512 basic_mul_int32_512 + + #define simd_add_float32_512_aligned basic_add_float32_512 + #define simd_sub_float32_512_aligned basic_sub_float32_512 + #define simd_mul_float32_512_aligned basic_mul_float32_512 + #define simd_div_float32_512_aligned basic_div_float32_512 + #define simd_add_int32_512_aligned basic_add_int32_512 + #define simd_sub_int32_512_aligned basic_sub_int32_512 + #define simd_mul_int32_512_aligned basic_mul_int32_512 #endif // SIMD_ENABLE_AVX512 #else @@ -269,22 +461,41 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) { #define simd_mul_float32_128 basic_mul_float32_128 #define simd_div_float32_64 basic_div_float32_64 #define simd_div_float32_128 basic_div_float32_128 +#define simd_add_float32_128_aligned basic_add_float32_128 +#define simd_sub_float32_128_aligned basic_sub_float32_128 +#define simd_mul_float32_128_aligned basic_mul_float32_128 +#define simd_div_float32_128_aligned basic_div_float32_128 // SSE2 #define simd_add_int32_128 basic_add_int32_128 #define simd_sub_int32_128 basic_sub_int32_128 #define simd_mul_int32_128 basic_mul_int32_128 +#define simd_add_int32_128_aligned basic_add_int32_128 +#define simd_sub_int32_128_aligned basic_sub_int32_128 +#define simd_mul_int32_128_aligned basic_mul_int32_128 + +// SSE41 +#define simd_dot_product_float32_64 basic_dot_product_float32_64 +#define simd_dot_product_float32_96 basic_dot_product_float32_96 +#define simd_dot_product_float32_128 basic_dot_product_float32_128 // AVX #define simd_add_float32_256 basic_add_float32_256 #define simd_sub_float32_256 basic_sub_float32_256 #define simd_mul_float32_256 basic_mul_float32_256 #define simd_div_float32_256 basic_div_float32_256 +#define simd_add_float32_256_aligned basic_add_float32_256 +#define simd_sub_float32_256_aligned basic_sub_float32_256 +#define simd_mul_float32_256_aligned basic_mul_float32_256 +#define simd_div_float32_256_aligned basic_div_float32_256 // AVX2 #define simd_add_int32_256 basic_add_int32_256 #define simd_sub_int32_256 basic_sub_int32_256 #define simd_mul_int32_256 basic_mul_int32_256 +#define simd_add_int32_256_aligned basic_add_int32_256 +#define simd_sub_int32_256_aligned basic_sub_int32_256 +#define simd_mul_int32_256_aligned basic_mul_int32_256 // AVX512 #define simd_add_float32_512 basic_add_float32_512 @@ -294,6 +505,13 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) { #define simd_add_int32_512 basic_add_int32_512 #define simd_sub_int32_512 basic_sub_int32_512 #define simd_mul_int32_512 basic_mul_int32_512 +#define simd_add_float32_512_aligned basic_add_float32_512 +#define simd_sub_float32_512_aligned basic_sub_float32_512 +#define simd_mul_float32_512_aligned basic_mul_float32_512 +#define simd_div_float32_512_aligned basic_div_float32_512 +#define simd_add_int32_512_aligned basic_add_int32_512 +#define simd_sub_int32_512_aligned basic_sub_int32_512 +#define simd_mul_int32_512_aligned basic_mul_int32_512 #endif @@ -411,7 +629,15 @@ inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result) { simd_mul_int32_256(a, b, result); simd_mul_int32_256(a+8, b+8, result+8); } - +inline float basic_dot_product_float32_64(float *a, float *b) { + return a[0] * b[0] + a[1] * b[1]; +} +inline float basic_dot_product_float32_96(float *a, float *b) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; +} +inline float basic_dot_product_float32_128(float *a, float *b) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; +} diff --git a/oogabooga/tests.c b/oogabooga/tests.c index 0e5fca8..31a7b0d 100644 --- a/oogabooga/tests.c +++ b/oogabooga/tests.c @@ -636,17 +636,17 @@ void test_simd() { simd_add_float32_64(a_f32, b_f32, result_f32); assert(floats_roughly_match(result_f32[0], a_f32[0]+b_f32[0]), "SIMD add float32 64 failed"); - simd_add_float32_128(a_f32, b_f32, result_f32); + simd_add_float32_128_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 4; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 128 failed"); } - simd_add_float32_256(a_f32, b_f32, result_f32); + simd_add_float32_256_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 8; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 256 failed"); } - simd_add_float32_512(a_f32, b_f32, result_f32); + simd_add_float32_512_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 16; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 512 failed"); } @@ -655,17 +655,17 @@ void test_simd() { simd_sub_float32_64(a_f32, b_f32, result_f32); assert(floats_roughly_match(result_f32[0], a_f32[0] - b_f32[0]), "SIMD sub float32 64 failed"); - simd_sub_float32_128(a_f32, b_f32, result_f32); + simd_sub_float32_128_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 4; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 128 failed"); } - simd_sub_float32_256(a_f32, b_f32, result_f32); + simd_sub_float32_256_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 8; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 256 failed"); } - simd_sub_float32_512(a_f32, b_f32, result_f32); + simd_sub_float32_512_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 16; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 512 failed"); } @@ -674,17 +674,17 @@ void test_simd() { simd_mul_float32_64(a_f32, b_f32, result_f32); assert(floats_roughly_match(result_f32[0], a_f32[0]*b_f32[0]), "SIMD mul float32 64 failed"); - simd_mul_float32_128(a_f32, b_f32, result_f32); + simd_mul_float32_128_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 4; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 128 failed"); } - simd_mul_float32_256(a_f32, b_f32, result_f32); + simd_mul_float32_256_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 8; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 256 failed"); } - simd_mul_float32_512(a_f32, b_f32, result_f32); + simd_mul_float32_512_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 16; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 512 failed"); } @@ -693,65 +693,65 @@ void test_simd() { simd_div_float32_64(a_f32, b_f32, result_f32); assert(floats_roughly_match(result_f32[0], a_f32[0]/b_f32[0]), "SIMD div float32 64 failed"); - simd_div_float32_128(a_f32, b_f32, result_f32); + simd_div_float32_128_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 4; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 128 failed"); } - simd_div_float32_256(a_f32, b_f32, result_f32); + simd_div_float32_256_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 8; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 256 failed, %.5f, %.5f, %.5f", result_f32[i], a_f32[i], a_f32[i] / b_f32[i]); } - simd_div_float32_512(a_f32, b_f32, result_f32); + simd_div_float32_512_aligned(a_f32, b_f32, result_f32); for (int i = 0; i < 16; ++i) { assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 512 failed"); } // Test int32 add - simd_add_int32_128(a_i32, b_i32, result_i32); + simd_add_int32_128_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 4; ++i) { assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 128 failed"); } - simd_add_int32_256(a_i32, b_i32, result_i32); + simd_add_int32_256_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 8; ++i) { assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 256 failed"); } - simd_add_int32_512(a_i32, b_i32, result_i32); + simd_add_int32_512_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 16; ++i) { assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 512 failed"); } // Test int32 subtract - simd_sub_int32_128(a_i32, b_i32, result_i32); + simd_sub_int32_128_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 4; ++i) { assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 128 failed"); } - simd_sub_int32_256(a_i32, b_i32, result_i32); + simd_sub_int32_256_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 8; ++i) { assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 256 failed"); } - simd_sub_int32_512(a_i32, b_i32, result_i32); + simd_sub_int32_512_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 16; ++i) { assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 512 failed"); } // Test int32 multiply - simd_mul_int32_128(a_i32, b_i32, result_i32); + simd_mul_int32_128_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 4; ++i) { assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 128 failed"); } - simd_mul_int32_256(a_i32, b_i32, result_i32); + simd_mul_int32_256_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 8; ++i) { assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 256 failed"); } - simd_mul_int32_512(a_i32, b_i32, result_i32); + simd_mul_int32_512_aligned(a_i32, b_i32, result_i32); for (int i = 0; i < 16; ++i) { assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 512 failed"); } @@ -759,15 +759,17 @@ void test_simd() { #define _TEST_NUM_SAMPLES ((100000 + 64) & ~(63)) assert(_TEST_NUM_SAMPLES % 16 == 0); - float *samples_a = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)); - float *samples_b = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)); + float *samples_a = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)+512); + float *samples_b = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)+512); + samples_a = (float*)(((u64)samples_a+64)&~(63)); + samples_b = (float*)(((u64)samples_b+64)&~(63)); memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float)); memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float)); u64 start = os_get_current_cycle_count(); for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 16) { - simd_mul_float32_512(&samples_a[i], &samples_b[i], &samples_a[i]); + simd_mul_float32_512_aligned(&samples_a[i], &samples_b[i], &samples_a[i]); } u64 end = os_get_current_cycle_count(); @@ -780,7 +782,7 @@ void test_simd() { start = os_get_current_cycle_count(); for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 8) { - simd_mul_float32_256(&samples_a[i], &samples_b[i], &samples_a[i]); + simd_mul_float32_256_aligned(&samples_a[i], &samples_b[i], &samples_a[i]); } end = os_get_current_cycle_count(); @@ -793,7 +795,7 @@ void test_simd() { start = os_get_current_cycle_count(); for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 4) { - simd_mul_float32_128(&samples_a[i], &samples_b[i], &samples_a[i]); + simd_mul_float32_128_aligned(&samples_a[i], &samples_b[i], &samples_a[i]); } end = os_get_current_cycle_count(); @@ -826,7 +828,8 @@ void test_simd() { cycles = end-start; print("NO SIMD float32 mul took %llu cycles\n", cycles); } -void test_math_library() { +// Indirect testing of some simd stuff +void test_linmath() { // Test vector creation and access Vector2 v2_test = v2(1.0f, 2.0f); @@ -1005,6 +1008,15 @@ void test_math_library() { Vector4 mixed_v4 = v4(4.0f, 8.0f, 12.0f, 16.0f); Vector4 mixed_v4_result = v4_mulf(mixed_v4, 0.25f); assert(mixed_v4_result.x == 1.0f && mixed_v4_result.y == 2.0f && mixed_v4_result.z == 3.0f && mixed_v4_result.w == 4.0f, "Mixed Vector4 scalar multiplication failed"); + + + float v2_dot = v2_dot_product(v2(2, 7), v2(3, 2)); + float v3_dot = v3_dot_product(v3(2, 7, 2), v3(3, 2, 9)); + float v4_dot = v4_dot_product(v4(2, 7, 6, 1), v4(3, 2, 1, 4)); + + assert(floats_roughly_match(v2_dot, 20), "Failed: v2_dot_product"); + assert(floats_roughly_match(v3_dot, 38), "Failed: v3_dot_product"); + assert(floats_roughly_match(v4_dot, 30), "Failed: v4_dot_product"); } void oogabooga_run_tests() { @@ -1022,23 +1034,23 @@ void oogabooga_run_tests() { print("OK!\n"); - print("Thread bombing allocator... "); - Thread* threads[100]; - for (int i = 0; i < 100; i++) { - threads[i] = os_make_thread(test_allocator_threaded, get_heap_allocator()); - os_start_thread(threads[i]); - } - for (int i = 0; i < 100; i++) { - os_join_thread(threads[i]); - } - print("OK!\n"); + //print("Thread bombing allocator... "); + //Thread* threads[100]; + //for (int i = 0; i < 100; i++) { + // threads[i] = os_make_thread(test_allocator_threaded, get_heap_allocator()); + // os_start_thread(threads[i]); + //} + //for (int i = 0; i < 100; i++) { + // os_join_thread(threads[i]); + //} + //print("OK!\n"); print("Testing file IO... "); test_file_io(); print("OK!\n"); - print("Testing file IO... "); - test_math_library(); + print("Testing linmath... "); + test_linmath(); print("OK!\n"); print("Testing simd... ");