Fixy smixy

This commit is contained in:
Charlie 2024-07-04 12:18:16 +02:00
parent 439f00caf4
commit 5db73b90e9
7 changed files with 305 additions and 55 deletions

View file

@ -6,6 +6,6 @@ mkdir build
pushd build pushd build
clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -msse4.1
popd popd

View file

@ -3,14 +3,14 @@
/// ///
// Build config stuff // Build config stuff
#define RUN_TESTS 0 #define RUN_TESTS 1
// This is only for people developing oogabooga! // This is only for people developing oogabooga!
#define OOGABOOGA_DEV 1 #define OOGABOOGA_DEV 1
#define ENABLE_PROFILING 0 #define ENABLE_PROFILING 0
// Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which don't. // ENABLE_SIMD Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which doesn't
#define ENABLE_SIMD 1 #define ENABLE_SIMD 1
#define INITIAL_PROGRAM_MEMORY_SIZE MB(5) #define INITIAL_PROGRAM_MEMORY_SIZE MB(5)

View file

@ -7,7 +7,7 @@ pushd build
mkdir release mkdir release
pushd release pushd release
clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -mavx2 -mavx512f -msse4.1 -msse2 -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -msse4.1
popd popd
popd popd

View file

@ -27,6 +27,7 @@ typedef struct Cpu_Capabilities {
// Compiler specific stuff // Compiler specific stuff
#if COMPILER_MVSC #if COMPILER_MVSC
#define inline __forceinline #define inline __forceinline
#define alignat(x) __declspec(align(x))
#define COMPILER_HAS_MEMCPY_INTRINSICS 1 #define COMPILER_HAS_MEMCPY_INTRINSICS 1
#include <intrin.h> #include <intrin.h>
#pragma intrinsic(__rdtsc) #pragma intrinsic(__rdtsc)
@ -63,6 +64,7 @@ typedef struct Cpu_Capabilities {
#endif #endif
#elif COMPILER_GCC || COMPILER_CLANG #elif COMPILER_GCC || COMPILER_CLANG
#define inline __attribute__((always_inline)) inline #define inline __attribute__((always_inline)) inline
#define alignat(x) __attribute__((aligned(x)))
#define COMPILER_HAS_MEMCPY_INTRINSICS 1 #define COMPILER_HAS_MEMCPY_INTRINSICS 1
inline u64 rdtsc() { inline u64 rdtsc() {
unsigned int lo, hi; unsigned int lo, hi;

View file

@ -13,13 +13,13 @@
#define to_radians32 to_radians #define to_radians32 to_radians
#define to_degrees32 to_degrees #define to_degrees32 to_degrees
typedef union Vector2 { typedef alignat(16) union Vector2 {
struct {float32 x, y;}; struct {float32 x, y;};
} Vector2; } Vector2;
inline Vector2 v2(float32 x, float32 y) { return (Vector2){x, y}; } inline Vector2 v2(float32 x, float32 y) { return (Vector2){x, y}; }
#define v2_expand(v) (v).x, (v).y #define v2_expand(v) (v).x, (v).y
typedef union Vector3 { typedef alignat(16) union Vector3 {
struct {float32 x, y, z;}; struct {float32 x, y, z;};
struct {float32 r, g, b;}; struct {float32 r, g, b;};
struct {Vector2 xy;}; struct {Vector2 xy;};
@ -28,7 +28,7 @@ typedef union Vector3 {
inline Vector3 v3(float32 x, float32 y, float32 z) { return (Vector3){x, y, z}; } inline Vector3 v3(float32 x, float32 y, float32 z) { return (Vector3){x, y, z}; }
#define v3_expand(v) (v).x, (v).y, (v).z #define v3_expand(v) (v).x, (v).y, (v).z
typedef union Vector4 { typedef alignat(16) union Vector4 {
struct {float32 x, y, z, w;}; struct {float32 x, y, z, w;};
struct {float32 x1, y1, x2, y2;}; struct {float32 x1, y1, x2, y2;};
struct {float32 r, g, b, a;}; struct {float32 r, g, b, a;};
@ -66,19 +66,19 @@ inline Vector2 v2_divf(Vector2 a, float32 s) {
inline Vector3 v3_add(Vector3 a, Vector3 b) { inline Vector3 v3_add(Vector3 a, Vector3 b) {
Vector4 a128 = v4(a.x, a.y, a.z, 0.0); Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
Vector4 b128 = v4(b.x, b.y, b.z, 0.0); Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
simd_add_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); simd_add_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128);
return a128.xyz; return a128.xyz;
} }
inline Vector3 v3_sub(Vector3 a, Vector3 b) { inline Vector3 v3_sub(Vector3 a, Vector3 b) {
Vector4 a128 = v4(a.x, a.y, a.z, 0.0); Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
Vector4 b128 = v4(b.x, b.y, b.z, 0.0); Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
simd_sub_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); simd_sub_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128);
return a128.xyz; return a128.xyz;
} }
inline Vector3 v3_mul(Vector3 a, Vector3 b) { inline Vector3 v3_mul(Vector3 a, Vector3 b) {
Vector4 a128 = v4(a.x, a.y, a.z, 0.0); Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
Vector4 b128 = v4(b.x, b.y, b.z, 0.0); Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
simd_mul_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); simd_mul_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128);
return a128.xyz; return a128.xyz;
} }
inline Vector3 v3_mulf(Vector3 a, float32 s) { inline Vector3 v3_mulf(Vector3 a, float32 s) {
@ -87,7 +87,7 @@ inline Vector3 v3_mulf(Vector3 a, float32 s) {
inline Vector3 v3_div(Vector3 a, Vector3 b) { inline Vector3 v3_div(Vector3 a, Vector3 b) {
Vector4 a128 = v4(a.x, a.y, a.z, 0.0); Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
Vector4 b128 = v4(b.x, b.y, b.z, 0.0); Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
simd_div_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128); simd_div_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128);
return a128.xyz; return a128.xyz;
} }
inline Vector3 v3_divf(Vector3 a, float32 s) { inline Vector3 v3_divf(Vector3 a, float32 s) {
@ -95,28 +95,29 @@ inline Vector3 v3_divf(Vector3 a, float32 s) {
} }
inline Vector4 v4_add(Vector4 a, Vector4 b) { inline Vector4 v4_add(Vector4 a, Vector4 b) {
simd_add_float32_128((f32*)&a, (f32*)&b, (f32*)&a); simd_add_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a);
return a; return a;
} }
inline Vector4 v4_sub(Vector4 a, Vector4 b) { inline Vector4 v4_sub(Vector4 a, Vector4 b) {
simd_sub_float32_128((f32*)&a, (f32*)&b, (f32*)&a); simd_sub_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a);
return a; return a;
} }
inline Vector4 v4_mul(Vector4 a, Vector4 b) { inline Vector4 v4_mul(Vector4 a, Vector4 b) {
simd_mul_float32_128((f32*)&a, (f32*)&b, (f32*)&a); simd_mul_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a);
return a; return a;
} }
inline Vector4 v4_mulf(Vector4 a, float32 s) { inline Vector4 v4_mulf(Vector4 a, float32 s) {
return v4_mul(a, v4(s, s, s, s)); return v4_mul(a, v4(s, s, s, s));
} }
inline Vector4 v4_div(Vector4 a, Vector4 b) { inline Vector4 v4_div(Vector4 a, Vector4 b) {
simd_div_float32_128((f32*)&a, (f32*)&b, (f32*)&a); simd_div_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a);
return a; return a;
} }
inline Vector4 v4_divf(Vector4 a, float32 s) { inline Vector4 v4_divf(Vector4 a, float32 s) {
return v4_div(a, v4(s, s, s, s)); return v4_div(a, v4(s, s, s, s));
} }
inline Vector2 v2_normalize(Vector2 a) { inline Vector2 v2_normalize(Vector2 a) {
float32 length = sqrt(a.x * a.x + a.y * a.y); float32 length = sqrt(a.x * a.x + a.y * a.y);
if (length == 0) { if (length == 0) {
@ -125,6 +126,15 @@ inline Vector2 v2_normalize(Vector2 a) {
return v2_divf(a, length); return v2_divf(a, length);
} }
inline float v2_dot_product(Vector2 a, Vector2 b) {
return simd_dot_product_float32_64((float*)&a, (float*)&b);
}
inline float v3_dot_product(Vector3 a, Vector3 b) {
return simd_dot_product_float32_96((float*)&a, (float*)&b);
}
inline float v4_dot_product(Vector4 a, Vector4 b) {
return simd_dot_product_float32_128((float*)&a, (float*)&b);
}
Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) { Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) {
float32 s = sin(rotation_radians); float32 s = sin(rotation_radians);

View file

@ -27,6 +27,10 @@ inline void basic_mul_int32_128(s32 *a, s32 *b, s32* result);
inline void basic_mul_int32_256(s32 *a, s32 *b, s32* result); inline void basic_mul_int32_256(s32 *a, s32 *b, s32* result);
inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result); inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result);
inline float basic_dot_product_float32_64(float *a, float *b);
inline float basic_dot_product_float32_96(float *a, float *b);
inline float basic_dot_product_float32_128(float *a, float *b);
#if ENABLE_SIMD #if ENABLE_SIMD
@ -74,6 +78,12 @@ inline void simd_add_float32_128(float *a, float *b, float* result) {
__m128 vr = _mm_add_ps(va, vb); __m128 vr = _mm_add_ps(va, vb);
_mm_storeu_ps(result, vr); _mm_storeu_ps(result, vr);
} }
inline void simd_add_float32_128_aligned(float *a, float *b, float* result) {
__m128 va = _mm_load_ps(a);
__m128 vb = _mm_load_ps(b);
__m128 vr = _mm_add_ps(va, vb);
_mm_store_ps(result, vr);
}
inline void simd_sub_float32_128(float *a, float *b, float* result) { inline void simd_sub_float32_128(float *a, float *b, float* result) {
__m128 va = _mm_loadu_ps(a); __m128 va = _mm_loadu_ps(a);
@ -81,6 +91,12 @@ inline void simd_sub_float32_128(float *a, float *b, float* result) {
__m128 vr = _mm_sub_ps(va, vb); __m128 vr = _mm_sub_ps(va, vb);
_mm_storeu_ps(result, vr); _mm_storeu_ps(result, vr);
} }
inline void simd_sub_float32_128_aligned(float *a, float *b, float* result) {
__m128 va = _mm_load_ps(a);
__m128 vb = _mm_load_ps(b);
__m128 vr = _mm_sub_ps(va, vb);
_mm_store_ps(result, vr);
}
inline void simd_mul_float32_128(float *a, float *b, float* result) { inline void simd_mul_float32_128(float *a, float *b, float* result) {
__m128 va = _mm_loadu_ps(a); __m128 va = _mm_loadu_ps(a);
@ -88,6 +104,12 @@ inline void simd_mul_float32_128(float *a, float *b, float* result) {
__m128 vr = _mm_mul_ps(va, vb); __m128 vr = _mm_mul_ps(va, vb);
_mm_storeu_ps(result, vr); _mm_storeu_ps(result, vr);
} }
inline void simd_mul_float32_128_aligned(float *a, float *b, float* result) {
__m128 va = _mm_load_ps(a);
__m128 vb = _mm_load_ps(b);
__m128 vr = _mm_mul_ps(va, vb);
_mm_store_ps(result, vr);
}
inline void simd_div_float32_128(float *a, float *b, float* result) { inline void simd_div_float32_128(float *a, float *b, float* result) {
__m128 va = _mm_loadu_ps(a); __m128 va = _mm_loadu_ps(a);
@ -95,6 +117,13 @@ inline void simd_div_float32_128(float *a, float *b, float* result) {
__m128 vr = _mm_div_ps(va, vb); __m128 vr = _mm_div_ps(va, vb);
_mm_storeu_ps(result, vr); _mm_storeu_ps(result, vr);
} }
inline void simd_div_float32_128_aligned(float *a, float *b, float* result) {
__m128 va = _mm_load_ps(a);
__m128 vb = _mm_load_ps(b);
__m128 vr = _mm_div_ps(va, vb);
_mm_store_ps(result, vr);
}
#if SIMD_ENABLE_SSE2 #if SIMD_ENABLE_SSE2
// SSE2 // SSE2
@ -107,17 +136,32 @@ inline void simd_add_int32_128(s32 *a, s32 *b, s32* result) {
__m128i vr = _mm_add_epi32(va, vb); __m128i vr = _mm_add_epi32(va, vb);
_mm_storeu_si128((__m128i*)result, vr); _mm_storeu_si128((__m128i*)result, vr);
} }
inline void simd_add_int32_128_aligned(s32 *a, s32 *b, s32* result) {
__m128i va = _mm_load_si128((__m128i*)a);
__m128i vb = _mm_load_si128((__m128i*)b);
__m128i vr = _mm_add_epi32(va, vb);
_mm_store_si128((__m128i*)result, vr);
}
inline void simd_sub_int32_128(s32 *a, s32 *b, s32* result) { inline void simd_sub_int32_128(s32 *a, s32 *b, s32* result) {
__m128i va = _mm_loadu_si128((__m128i*)a); __m128i va = _mm_loadu_si128((__m128i*)a);
__m128i vb = _mm_loadu_si128((__m128i*)b); __m128i vb = _mm_loadu_si128((__m128i*)b);
__m128i vr = _mm_sub_epi32(va, vb); __m128i vr = _mm_sub_epi32(va, vb);
_mm_storeu_si128((__m128i*)result, vr); _mm_storeu_si128((__m128i*)result, vr);
} }
inline void simd_sub_int32_128_aligned(s32 *a, s32 *b, s32* result) {
__m128i va = _mm_load_si128((__m128i*)a);
__m128i vb = _mm_load_si128((__m128i*)b);
__m128i vr = _mm_sub_epi32(va, vb);
_mm_store_si128((__m128i*)result, vr);
}
#else #else
#define simd_add_int32_128 basic_add_int32_128 #define simd_add_int32_128 basic_add_int32_128
#define simd_sub_int32_128 basic_sub_int32_128 #define simd_sub_int32_128 basic_sub_int32_128
#define simd_add_int32_128_aligned basic_add_int32_128
#define simd_sub_int32_128_aligned basic_sub_int32_128
#endif #endif
#if SIMD_ENABLE_SSE41 #if SIMD_ENABLE_SSE41
@ -127,8 +171,55 @@ inline void simd_mul_int32_128(s32 *a, s32 *b, s32* result) {
__m128i vr = _mm_mullo_epi32(va, vb); __m128i vr = _mm_mullo_epi32(va, vb);
_mm_storeu_si128((__m128i*)result, vr); _mm_storeu_si128((__m128i*)result, vr);
} }
inline void simd_mul_int32_128_aligned(s32 *a, s32 *b, s32* result) {
__m128i va = _mm_load_si128((__m128i*)a);
__m128i vb = _mm_load_si128((__m128i*)b);
__m128i vr = _mm_mullo_epi32(va, vb);
_mm_store_si128((__m128i*)result, vr);
}
inline float simd_dot_product_float32_64(float *a, float *b) {
__m128 vec1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)a);
__m128 vec2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)b);
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x31);
return _mm_cvtss_f32(dot_product);
}
inline float simd_dot_product_float32_96(float *a, float *b) {
__m128 vec1 = _mm_loadu_ps(a);
__m128 vec2 = _mm_loadu_ps(b);
vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
return _mm_cvtss_f32(dot_product);
}
inline float simd_dot_product_float32_96_aligned(float *a, float *b) {
__m128 vec1 = _mm_load_ps(a);
__m128 vec2 = _mm_load_ps(b);
vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
return _mm_cvtss_f32(dot_product);
}
inline float simd_dot_product_float32_128(float *a, float *b) {
__m128 vec1 = _mm_loadu_ps(a);
__m128 vec2 = _mm_loadu_ps(b);
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0xF1);
return _mm_cvtss_f32(dot_product);
}
inline float simd_dot_product_float32_128_aligned(float *a, float *b) {
__m128 vec1 = _mm_load_ps(a);
__m128 vec2 = _mm_load_ps(b);
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0xF1);
return _mm_cvtss_f32(dot_product);
}
#else #else
#define simd_mul_int32_128 basic_mul_int32_128 #define simd_mul_int32_128 basic_mul_int32_128
#define simd_mul_int32_128_aligned basic_mul_int32_128
#define simd_dot_product_float32_64 basic_dot_product_float32_64
#define simd_dot_product_float32_96 basic_dot_product_float32_96
#define simd_dot_product_float32_128 basic_dot_product_float32_128
#define simd_dot_product_float32_64_aligned basic_dot_product_float32_64
#define simd_dot_product_float32_96_aligned basic_dot_product_float32_96
#define simd_dot_product_float32_128_aligned basic_dot_product_float32_128
#endif // SIMD_ENABLE_SSE41 #endif // SIMD_ENABLE_SSE41
#if SIMD_ENABLE_AVX #if SIMD_ENABLE_AVX
@ -142,29 +233,58 @@ inline void simd_add_float32_256(float32 *a, float32 *b, float32* result) {
__m256 vr = _mm256_add_ps(va, vb); __m256 vr = _mm256_add_ps(va, vb);
_mm256_storeu_ps(result, vr); _mm256_storeu_ps(result, vr);
} }
inline void simd_add_float32_256_aligned(float32 *a, float32 *b, float32* result) {
__m256 va = _mm256_load_ps(a);
__m256 vb = _mm256_load_ps(b);
__m256 vr = _mm256_add_ps(va, vb);
_mm256_store_ps(result, vr);
}
inline void simd_sub_float32_256(float32 *a, float32 *b, float32* result) { inline void simd_sub_float32_256(float32 *a, float32 *b, float32* result) {
__m256 va = _mm256_loadu_ps(a); __m256 va = _mm256_loadu_ps(a);
__m256 vb = _mm256_loadu_ps(b); __m256 vb = _mm256_loadu_ps(b);
__m256 vr = _mm256_sub_ps(va, vb); __m256 vr = _mm256_sub_ps(va, vb);
_mm256_storeu_ps(result, vr); _mm256_storeu_ps(result, vr);
} }
inline void simd_sub_float32_256_aligned(float32 *a, float32 *b, float32* result) {
__m256 va = _mm256_load_ps(a);
__m256 vb = _mm256_load_ps(b);
__m256 vr = _mm256_sub_ps(va, vb);
_mm256_store_ps(result, vr);
}
inline void simd_mul_float32_256(float32 *a, float32 *b, float32* result) { inline void simd_mul_float32_256(float32 *a, float32 *b, float32* result) {
__m256 va = _mm256_loadu_ps(a); __m256 va = _mm256_loadu_ps(a);
__m256 vb = _mm256_loadu_ps(b); __m256 vb = _mm256_loadu_ps(b);
__m256 vr = _mm256_mul_ps(va, vb); __m256 vr = _mm256_mul_ps(va, vb);
_mm256_storeu_ps(result, vr); _mm256_storeu_ps(result, vr);
} }
inline void simd_mul_float32_256_aligned(float32 *a, float32 *b, float32* result) {
__m256 va = _mm256_load_ps(a);
__m256 vb = _mm256_load_ps(b);
__m256 vr = _mm256_mul_ps(va, vb);
_mm256_store_ps(result, vr);
}
inline void simd_div_float32_256(float32 *a, float32 *b, float32* result){ inline void simd_div_float32_256(float32 *a, float32 *b, float32* result){
__m256 va = _mm256_loadu_ps(a); __m256 va = _mm256_loadu_ps(a);
__m256 vb = _mm256_loadu_ps(b); __m256 vb = _mm256_loadu_ps(b);
__m256 vr = _mm256_div_ps(va, vb); __m256 vr = _mm256_div_ps(va, vb);
_mm256_storeu_ps(result, vr); _mm256_storeu_ps(result, vr);
} }
inline void simd_div_float32_256_aligned(float32 *a, float32 *b, float32* result){
__m256 va = _mm256_load_ps(a);
__m256 vb = _mm256_load_ps(b);
__m256 vr = _mm256_div_ps(va, vb);
_mm256_store_ps(result, vr);
}
#else #else
#define simd_add_float32_256 basic_add_float32_256 #define simd_add_float32_256 basic_add_float32_256
#define simd_sub_float32_256 basic_sub_float32_256 #define simd_sub_float32_256 basic_sub_float32_256
#define simd_mul_float32_256 basic_mul_float32_256 #define simd_mul_float32_256 basic_mul_float32_256
#define simd_div_float32_256 basic_div_float32_256 #define simd_div_float32_256 basic_div_float32_256
#define simd_add_float32_256_aligned basic_add_float32_256
#define simd_sub_float32_256_aligned basic_sub_float32_256
#define simd_mul_float32_256_aligned basic_mul_float32_256
#define simd_div_float32_256_aligned basic_div_float32_256
#endif #endif
#if SIMD_ENABLE_AVX2 #if SIMD_ENABLE_AVX2
@ -178,22 +298,44 @@ inline void simd_add_int32_256(s32 *a, s32 *b, s32* result) {
__m256i vr = _mm256_add_epi32(va, vb); __m256i vr = _mm256_add_epi32(va, vb);
_mm256_storeu_si256((__m256i*)result, vr); _mm256_storeu_si256((__m256i*)result, vr);
} }
inline void simd_add_int32_256_aligned(s32 *a, s32 *b, s32* result) {
__m256i va = _mm256_load_si256((__m256i*)a);
__m256i vb = _mm256_load_si256((__m256i*)b);
__m256i vr = _mm256_add_epi32(va, vb);
_mm256_store_si256((__m256i*)result, vr);
}
inline void simd_sub_int32_256(s32 *a, s32 *b, s32* result) { inline void simd_sub_int32_256(s32 *a, s32 *b, s32* result) {
__m256i va = _mm256_loadu_si256((__m256i*)a); __m256i va = _mm256_loadu_si256((__m256i*)a);
__m256i vb = _mm256_loadu_si256((__m256i*)b); __m256i vb = _mm256_loadu_si256((__m256i*)b);
__m256i vr = _mm256_sub_epi32(va, vb); __m256i vr = _mm256_sub_epi32(va, vb);
_mm256_storeu_si256((__m256i*)result, vr); _mm256_storeu_si256((__m256i*)result, vr);
} }
inline void simd_sub_int32_256_aligned(s32 *a, s32 *b, s32* result) {
__m256i va = _mm256_load_si256((__m256i*)a);
__m256i vb = _mm256_load_si256((__m256i*)b);
__m256i vr = _mm256_sub_epi32(va, vb);
_mm256_store_si256((__m256i*)result, vr);
}
inline void simd_mul_int32_256(s32 *a, s32 *b, s32* result) { inline void simd_mul_int32_256(s32 *a, s32 *b, s32* result) {
__m256i va = _mm256_loadu_si256((__m256i*)a); __m256i va = _mm256_loadu_si256((__m256i*)a);
__m256i vb = _mm256_loadu_si256((__m256i*)b); __m256i vb = _mm256_loadu_si256((__m256i*)b);
__m256i vr = _mm256_mullo_epi32(va, vb); __m256i vr = _mm256_mullo_epi32(va, vb);
_mm256_storeu_si256((__m256i*)result, vr); _mm256_storeu_si256((__m256i*)result, vr);
} }
inline void simd_mul_int32_256_aligned(s32 *a, s32 *b, s32* result) {
__m256i va = _mm256_load_si256((__m256i*)a);
__m256i vb = _mm256_load_si256((__m256i*)b);
__m256i vr = _mm256_mullo_epi32(va, vb);
_mm256_store_si256((__m256i*)result, vr);
}
#else #else
#define simd_add_int32_256 basic_add_int32_256 #define simd_add_int32_256 basic_add_int32_256
#define simd_sub_int32_256 basic_sub_int32_256 #define simd_sub_int32_256 basic_sub_int32_256
#define simd_mul_int32_256 basic_mul_int32_256 #define simd_mul_int32_256 basic_mul_int32_256
#define simd_add_int32_256_aligned basic_add_int32_256
#define simd_sub_int32_256_aligned basic_sub_int32_256
#define simd_mul_int32_256_aligned basic_mul_int32_256
#endif #endif
#if SIMD_ENABLE_AVX512 #if SIMD_ENABLE_AVX512
@ -207,6 +349,12 @@ inline void simd_add_float32_512(float *a, float *b, float* result) {
__m512 vr = _mm512_add_ps(va, vb); __m512 vr = _mm512_add_ps(va, vb);
_mm512_storeu_ps(result, vr); _mm512_storeu_ps(result, vr);
} }
inline void simd_add_float32_512_aligned(float *a, float *b, float* result) {
__m512 va = _mm512_load_ps(a);
__m512 vb = _mm512_load_ps(b);
__m512 vr = _mm512_add_ps(va, vb);
_mm512_store_ps(result, vr);
}
inline void simd_sub_float32_512(float *a, float *b, float* result) { inline void simd_sub_float32_512(float *a, float *b, float* result) {
__m512 va = _mm512_loadu_ps(a); __m512 va = _mm512_loadu_ps(a);
@ -214,6 +362,12 @@ inline void simd_sub_float32_512(float *a, float *b, float* result) {
__m512 vr = _mm512_sub_ps(va, vb); __m512 vr = _mm512_sub_ps(va, vb);
_mm512_storeu_ps(result, vr); _mm512_storeu_ps(result, vr);
} }
inline void simd_sub_float32_512_aligned(float *a, float *b, float* result) {
__m512 va = _mm512_load_ps(a);
__m512 vb = _mm512_load_ps(b);
__m512 vr = _mm512_sub_ps(va, vb);
_mm512_store_ps(result, vr);
}
inline void simd_mul_float32_512(float *a, float *b, float* result) { inline void simd_mul_float32_512(float *a, float *b, float* result) {
__m512 va = _mm512_loadu_ps(a); __m512 va = _mm512_loadu_ps(a);
@ -221,6 +375,12 @@ inline void simd_mul_float32_512(float *a, float *b, float* result) {
__m512 vr = _mm512_mul_ps(va, vb); __m512 vr = _mm512_mul_ps(va, vb);
_mm512_storeu_ps(result, vr); _mm512_storeu_ps(result, vr);
} }
inline void simd_mul_float32_512_aligned(float *a, float *b, float* result) {
__m512 va = _mm512_load_ps(a);
__m512 vb = _mm512_load_ps(b);
__m512 vr = _mm512_mul_ps(va, vb);
_mm512_store_ps(result, vr);
}
inline void simd_div_float32_512(float *a, float *b, float* result) { inline void simd_div_float32_512(float *a, float *b, float* result) {
__m512 va = _mm512_loadu_ps(a); __m512 va = _mm512_loadu_ps(a);
@ -228,12 +388,24 @@ inline void simd_div_float32_512(float *a, float *b, float* result) {
__m512 vr = _mm512_div_ps(va, vb); __m512 vr = _mm512_div_ps(va, vb);
_mm512_storeu_ps(result, vr); _mm512_storeu_ps(result, vr);
} }
inline void simd_div_float32_512_aligned(float *a, float *b, float* result) {
__m512 va = _mm512_load_ps(a);
__m512 vb = _mm512_load_ps(b);
__m512 vr = _mm512_div_ps(va, vb);
_mm512_store_ps(result, vr);
}
inline void simd_add_int32_512(int32 *a, int32 *b, int32* result) { inline void simd_add_int32_512(int32 *a, int32 *b, int32* result) {
__m512i va = _mm512_loadu_si512((__m512i*)a); __m512i va = _mm512_loadu_si512((__m512i*)a);
__m512i vb = _mm512_loadu_si512((__m512i*)b); __m512i vb = _mm512_loadu_si512((__m512i*)b);
__m512i vr = _mm512_add_epi32(va, vb); __m512i vr = _mm512_add_epi32(va, vb);
_mm512_storeu_si512((__m512i*)result, vr); _mm512_storeu_si512((__m512i*)result, vr);
} }
inline void simd_add_int32_512_aligned(int32 *a, int32 *b, int32* result) {
__m512i va = _mm512_load_si512((__m512i*)a);
__m512i vb = _mm512_load_si512((__m512i*)b);
__m512i vr = _mm512_add_epi32(va, vb);
_mm512_store_si512((__m512i*)result, vr);
}
inline void simd_sub_int32_512(int32 *a, int32 *b, int32* result) { inline void simd_sub_int32_512(int32 *a, int32 *b, int32* result) {
__m512i va = _mm512_loadu_si512((__m512i*)a); __m512i va = _mm512_loadu_si512((__m512i*)a);
@ -241,6 +413,12 @@ inline void simd_sub_int32_512(int32 *a, int32 *b, int32* result) {
__m512i vr = _mm512_sub_epi32(va, vb); __m512i vr = _mm512_sub_epi32(va, vb);
_mm512_storeu_si512((__m512i*)result, vr); _mm512_storeu_si512((__m512i*)result, vr);
} }
inline void simd_sub_int32_512_aligned(int32 *a, int32 *b, int32* result) {
__m512i va = _mm512_load_si512((__m512i*)a);
__m512i vb = _mm512_load_si512((__m512i*)b);
__m512i vr = _mm512_sub_epi32(va, vb);
_mm512_store_si512((__m512i*)result, vr);
}
inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) { inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
__m512i va = _mm512_loadu_si512((__m512i*)a); __m512i va = _mm512_loadu_si512((__m512i*)a);
@ -248,6 +426,12 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
__m512i vr = _mm512_mullo_epi32(va, vb); __m512i vr = _mm512_mullo_epi32(va, vb);
_mm512_storeu_si512((__m512i*)result, vr); _mm512_storeu_si512((__m512i*)result, vr);
} }
inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
__m512i va = _mm512_load_si512((__m512i*)a);
__m512i vb = _mm512_load_si512((__m512i*)b);
__m512i vr = _mm512_mullo_epi32(va, vb);
_mm512_store_si512((__m512i*)result, vr);
}
#else #else
#define simd_add_float32_512 basic_add_float32_512 #define simd_add_float32_512 basic_add_float32_512
#define simd_sub_float32_512 basic_sub_float32_512 #define simd_sub_float32_512 basic_sub_float32_512
@ -256,6 +440,14 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512 basic_add_int32_512 #define simd_add_int32_512 basic_add_int32_512
#define simd_sub_int32_512 basic_sub_int32_512 #define simd_sub_int32_512 basic_sub_int32_512
#define simd_mul_int32_512 basic_mul_int32_512 #define simd_mul_int32_512 basic_mul_int32_512
#define simd_add_float32_512_aligned basic_add_float32_512
#define simd_sub_float32_512_aligned basic_sub_float32_512
#define simd_mul_float32_512_aligned basic_mul_float32_512
#define simd_div_float32_512_aligned basic_div_float32_512
#define simd_add_int32_512_aligned basic_add_int32_512
#define simd_sub_int32_512_aligned basic_sub_int32_512
#define simd_mul_int32_512_aligned basic_mul_int32_512
#endif // SIMD_ENABLE_AVX512 #endif // SIMD_ENABLE_AVX512
#else #else
@ -269,22 +461,41 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
#define simd_mul_float32_128 basic_mul_float32_128 #define simd_mul_float32_128 basic_mul_float32_128
#define simd_div_float32_64 basic_div_float32_64 #define simd_div_float32_64 basic_div_float32_64
#define simd_div_float32_128 basic_div_float32_128 #define simd_div_float32_128 basic_div_float32_128
#define simd_add_float32_128_aligned basic_add_float32_128
#define simd_sub_float32_128_aligned basic_sub_float32_128
#define simd_mul_float32_128_aligned basic_mul_float32_128
#define simd_div_float32_128_aligned basic_div_float32_128
// SSE2 // SSE2
#define simd_add_int32_128 basic_add_int32_128 #define simd_add_int32_128 basic_add_int32_128
#define simd_sub_int32_128 basic_sub_int32_128 #define simd_sub_int32_128 basic_sub_int32_128
#define simd_mul_int32_128 basic_mul_int32_128 #define simd_mul_int32_128 basic_mul_int32_128
#define simd_add_int32_128_aligned basic_add_int32_128
#define simd_sub_int32_128_aligned basic_sub_int32_128
#define simd_mul_int32_128_aligned basic_mul_int32_128
// SSE41
#define simd_dot_product_float32_64 basic_dot_product_float32_64
#define simd_dot_product_float32_96 basic_dot_product_float32_96
#define simd_dot_product_float32_128 basic_dot_product_float32_128
// AVX // AVX
#define simd_add_float32_256 basic_add_float32_256 #define simd_add_float32_256 basic_add_float32_256
#define simd_sub_float32_256 basic_sub_float32_256 #define simd_sub_float32_256 basic_sub_float32_256
#define simd_mul_float32_256 basic_mul_float32_256 #define simd_mul_float32_256 basic_mul_float32_256
#define simd_div_float32_256 basic_div_float32_256 #define simd_div_float32_256 basic_div_float32_256
#define simd_add_float32_256_aligned basic_add_float32_256
#define simd_sub_float32_256_aligned basic_sub_float32_256
#define simd_mul_float32_256_aligned basic_mul_float32_256
#define simd_div_float32_256_aligned basic_div_float32_256
// AVX2 // AVX2
#define simd_add_int32_256 basic_add_int32_256 #define simd_add_int32_256 basic_add_int32_256
#define simd_sub_int32_256 basic_sub_int32_256 #define simd_sub_int32_256 basic_sub_int32_256
#define simd_mul_int32_256 basic_mul_int32_256 #define simd_mul_int32_256 basic_mul_int32_256
#define simd_add_int32_256_aligned basic_add_int32_256
#define simd_sub_int32_256_aligned basic_sub_int32_256
#define simd_mul_int32_256_aligned basic_mul_int32_256
// AVX512 // AVX512
#define simd_add_float32_512 basic_add_float32_512 #define simd_add_float32_512 basic_add_float32_512
@ -294,6 +505,13 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
#define simd_add_int32_512 basic_add_int32_512 #define simd_add_int32_512 basic_add_int32_512
#define simd_sub_int32_512 basic_sub_int32_512 #define simd_sub_int32_512 basic_sub_int32_512
#define simd_mul_int32_512 basic_mul_int32_512 #define simd_mul_int32_512 basic_mul_int32_512
#define simd_add_float32_512_aligned basic_add_float32_512
#define simd_sub_float32_512_aligned basic_sub_float32_512
#define simd_mul_float32_512_aligned basic_mul_float32_512
#define simd_div_float32_512_aligned basic_div_float32_512
#define simd_add_int32_512_aligned basic_add_int32_512
#define simd_sub_int32_512_aligned basic_sub_int32_512
#define simd_mul_int32_512_aligned basic_mul_int32_512
#endif #endif
@ -411,7 +629,15 @@ inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result) {
simd_mul_int32_256(a, b, result); simd_mul_int32_256(a, b, result);
simd_mul_int32_256(a+8, b+8, result+8); simd_mul_int32_256(a+8, b+8, result+8);
} }
inline float basic_dot_product_float32_64(float *a, float *b) {
return a[0] * b[0] + a[1] * b[1];
}
inline float basic_dot_product_float32_96(float *a, float *b) {
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
inline float basic_dot_product_float32_128(float *a, float *b) {
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

View file

@ -636,17 +636,17 @@ void test_simd() {
simd_add_float32_64(a_f32, b_f32, result_f32); simd_add_float32_64(a_f32, b_f32, result_f32);
assert(floats_roughly_match(result_f32[0], a_f32[0]+b_f32[0]), "SIMD add float32 64 failed"); assert(floats_roughly_match(result_f32[0], a_f32[0]+b_f32[0]), "SIMD add float32 64 failed");
simd_add_float32_128(a_f32, b_f32, result_f32); simd_add_float32_128_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 128 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 128 failed");
} }
simd_add_float32_256(a_f32, b_f32, result_f32); simd_add_float32_256_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 256 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 256 failed");
} }
simd_add_float32_512(a_f32, b_f32, result_f32); simd_add_float32_512_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 512 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 512 failed");
} }
@ -655,17 +655,17 @@ void test_simd() {
simd_sub_float32_64(a_f32, b_f32, result_f32); simd_sub_float32_64(a_f32, b_f32, result_f32);
assert(floats_roughly_match(result_f32[0], a_f32[0] - b_f32[0]), "SIMD sub float32 64 failed"); assert(floats_roughly_match(result_f32[0], a_f32[0] - b_f32[0]), "SIMD sub float32 64 failed");
simd_sub_float32_128(a_f32, b_f32, result_f32); simd_sub_float32_128_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 128 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 128 failed");
} }
simd_sub_float32_256(a_f32, b_f32, result_f32); simd_sub_float32_256_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 256 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 256 failed");
} }
simd_sub_float32_512(a_f32, b_f32, result_f32); simd_sub_float32_512_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 512 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 512 failed");
} }
@ -674,17 +674,17 @@ void test_simd() {
simd_mul_float32_64(a_f32, b_f32, result_f32); simd_mul_float32_64(a_f32, b_f32, result_f32);
assert(floats_roughly_match(result_f32[0], a_f32[0]*b_f32[0]), "SIMD mul float32 64 failed"); assert(floats_roughly_match(result_f32[0], a_f32[0]*b_f32[0]), "SIMD mul float32 64 failed");
simd_mul_float32_128(a_f32, b_f32, result_f32); simd_mul_float32_128_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 128 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 128 failed");
} }
simd_mul_float32_256(a_f32, b_f32, result_f32); simd_mul_float32_256_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 256 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 256 failed");
} }
simd_mul_float32_512(a_f32, b_f32, result_f32); simd_mul_float32_512_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 512 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 512 failed");
} }
@ -693,65 +693,65 @@ void test_simd() {
simd_div_float32_64(a_f32, b_f32, result_f32); simd_div_float32_64(a_f32, b_f32, result_f32);
assert(floats_roughly_match(result_f32[0], a_f32[0]/b_f32[0]), "SIMD div float32 64 failed"); assert(floats_roughly_match(result_f32[0], a_f32[0]/b_f32[0]), "SIMD div float32 64 failed");
simd_div_float32_128(a_f32, b_f32, result_f32); simd_div_float32_128_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 128 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 128 failed");
} }
simd_div_float32_256(a_f32, b_f32, result_f32); simd_div_float32_256_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 256 failed, %.5f, %.5f, %.5f", result_f32[i], a_f32[i], a_f32[i] / b_f32[i]); assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 256 failed, %.5f, %.5f, %.5f", result_f32[i], a_f32[i], a_f32[i] / b_f32[i]);
} }
simd_div_float32_512(a_f32, b_f32, result_f32); simd_div_float32_512_aligned(a_f32, b_f32, result_f32);
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 512 failed"); assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 512 failed");
} }
// Test int32 add // Test int32 add
simd_add_int32_128(a_i32, b_i32, result_i32); simd_add_int32_128_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 128 failed"); assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 128 failed");
} }
simd_add_int32_256(a_i32, b_i32, result_i32); simd_add_int32_256_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 256 failed"); assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 256 failed");
} }
simd_add_int32_512(a_i32, b_i32, result_i32); simd_add_int32_512_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 512 failed"); assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 512 failed");
} }
// Test int32 subtract // Test int32 subtract
simd_sub_int32_128(a_i32, b_i32, result_i32); simd_sub_int32_128_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 128 failed"); assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 128 failed");
} }
simd_sub_int32_256(a_i32, b_i32, result_i32); simd_sub_int32_256_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 256 failed"); assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 256 failed");
} }
simd_sub_int32_512(a_i32, b_i32, result_i32); simd_sub_int32_512_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 512 failed"); assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 512 failed");
} }
// Test int32 multiply // Test int32 multiply
simd_mul_int32_128(a_i32, b_i32, result_i32); simd_mul_int32_128_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 128 failed"); assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 128 failed");
} }
simd_mul_int32_256(a_i32, b_i32, result_i32); simd_mul_int32_256_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 256 failed"); assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 256 failed");
} }
simd_mul_int32_512(a_i32, b_i32, result_i32); simd_mul_int32_512_aligned(a_i32, b_i32, result_i32);
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 512 failed"); assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 512 failed");
} }
@ -759,15 +759,17 @@ void test_simd() {
#define _TEST_NUM_SAMPLES ((100000 + 64) & ~(63)) #define _TEST_NUM_SAMPLES ((100000 + 64) & ~(63))
assert(_TEST_NUM_SAMPLES % 16 == 0); assert(_TEST_NUM_SAMPLES % 16 == 0);
float *samples_a = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)); float *samples_a = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)+512);
float *samples_b = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)); float *samples_b = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)+512);
samples_a = (float*)(((u64)samples_a+64)&~(63));
samples_b = (float*)(((u64)samples_b+64)&~(63));
memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float)); memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float));
memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float)); memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float));
u64 start = os_get_current_cycle_count(); u64 start = os_get_current_cycle_count();
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 16) { for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 16) {
simd_mul_float32_512(&samples_a[i], &samples_b[i], &samples_a[i]); simd_mul_float32_512_aligned(&samples_a[i], &samples_b[i], &samples_a[i]);
} }
u64 end = os_get_current_cycle_count(); u64 end = os_get_current_cycle_count();
@ -780,7 +782,7 @@ void test_simd() {
start = os_get_current_cycle_count(); start = os_get_current_cycle_count();
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 8) { for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 8) {
simd_mul_float32_256(&samples_a[i], &samples_b[i], &samples_a[i]); simd_mul_float32_256_aligned(&samples_a[i], &samples_b[i], &samples_a[i]);
} }
end = os_get_current_cycle_count(); end = os_get_current_cycle_count();
@ -793,7 +795,7 @@ void test_simd() {
start = os_get_current_cycle_count(); start = os_get_current_cycle_count();
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 4) { for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 4) {
simd_mul_float32_128(&samples_a[i], &samples_b[i], &samples_a[i]); simd_mul_float32_128_aligned(&samples_a[i], &samples_b[i], &samples_a[i]);
} }
end = os_get_current_cycle_count(); end = os_get_current_cycle_count();
@ -826,7 +828,8 @@ void test_simd() {
cycles = end-start; cycles = end-start;
print("NO SIMD float32 mul took %llu cycles\n", cycles); print("NO SIMD float32 mul took %llu cycles\n", cycles);
} }
void test_math_library() { // Indirect testing of some simd stuff
void test_linmath() {
// Test vector creation and access // Test vector creation and access
Vector2 v2_test = v2(1.0f, 2.0f); Vector2 v2_test = v2(1.0f, 2.0f);
@ -1005,6 +1008,15 @@ void test_math_library() {
Vector4 mixed_v4 = v4(4.0f, 8.0f, 12.0f, 16.0f); Vector4 mixed_v4 = v4(4.0f, 8.0f, 12.0f, 16.0f);
Vector4 mixed_v4_result = v4_mulf(mixed_v4, 0.25f); Vector4 mixed_v4_result = v4_mulf(mixed_v4, 0.25f);
assert(mixed_v4_result.x == 1.0f && mixed_v4_result.y == 2.0f && mixed_v4_result.z == 3.0f && mixed_v4_result.w == 4.0f, "Mixed Vector4 scalar multiplication failed"); assert(mixed_v4_result.x == 1.0f && mixed_v4_result.y == 2.0f && mixed_v4_result.z == 3.0f && mixed_v4_result.w == 4.0f, "Mixed Vector4 scalar multiplication failed");
float v2_dot = v2_dot_product(v2(2, 7), v2(3, 2));
float v3_dot = v3_dot_product(v3(2, 7, 2), v3(3, 2, 9));
float v4_dot = v4_dot_product(v4(2, 7, 6, 1), v4(3, 2, 1, 4));
assert(floats_roughly_match(v2_dot, 20), "Failed: v2_dot_product");
assert(floats_roughly_match(v3_dot, 38), "Failed: v3_dot_product");
assert(floats_roughly_match(v4_dot, 30), "Failed: v4_dot_product");
} }
void oogabooga_run_tests() { void oogabooga_run_tests() {
@ -1022,23 +1034,23 @@ void oogabooga_run_tests() {
print("OK!\n"); print("OK!\n");
print("Thread bombing allocator... "); //print("Thread bombing allocator... ");
Thread* threads[100]; //Thread* threads[100];
for (int i = 0; i < 100; i++) { //for (int i = 0; i < 100; i++) {
threads[i] = os_make_thread(test_allocator_threaded, get_heap_allocator()); // threads[i] = os_make_thread(test_allocator_threaded, get_heap_allocator());
os_start_thread(threads[i]); // os_start_thread(threads[i]);
} //}
for (int i = 0; i < 100; i++) { //for (int i = 0; i < 100; i++) {
os_join_thread(threads[i]); // os_join_thread(threads[i]);
} //}
print("OK!\n"); //print("OK!\n");
print("Testing file IO... "); print("Testing file IO... ");
test_file_io(); test_file_io();
print("OK!\n"); print("OK!\n");
print("Testing file IO... "); print("Testing linmath... ");
test_math_library(); test_linmath();
print("OK!\n"); print("OK!\n");
print("Testing simd... "); print("Testing simd... ");