Fixy smixy
This commit is contained in:
parent
439f00caf4
commit
5db73b90e9
7 changed files with 305 additions and 55 deletions
|
@ -6,6 +6,6 @@ mkdir build
|
||||||
|
|
||||||
pushd build
|
pushd build
|
||||||
|
|
||||||
clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler
|
clang -g -o cgame.exe ../build.c -O0 -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -msse4.1
|
||||||
|
|
||||||
popd
|
popd
|
4
build.c
4
build.c
|
@ -3,14 +3,14 @@
|
||||||
///
|
///
|
||||||
// Build config stuff
|
// Build config stuff
|
||||||
|
|
||||||
#define RUN_TESTS 0
|
#define RUN_TESTS 1
|
||||||
|
|
||||||
// This is only for people developing oogabooga!
|
// This is only for people developing oogabooga!
|
||||||
#define OOGABOOGA_DEV 1
|
#define OOGABOOGA_DEV 1
|
||||||
|
|
||||||
#define ENABLE_PROFILING 0
|
#define ENABLE_PROFILING 0
|
||||||
|
|
||||||
// Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which don't.
|
// ENABLE_SIMD Requires CPU to support at least SSE1 but I will be very surprised if you find a system today which doesn't
|
||||||
#define ENABLE_SIMD 1
|
#define ENABLE_SIMD 1
|
||||||
|
|
||||||
#define INITIAL_PROGRAM_MEMORY_SIZE MB(5)
|
#define INITIAL_PROGRAM_MEMORY_SIZE MB(5)
|
||||||
|
|
|
@ -7,7 +7,7 @@ pushd build
|
||||||
mkdir release
|
mkdir release
|
||||||
pushd release
|
pushd release
|
||||||
|
|
||||||
clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -mavx2 -mavx512f -msse4.1 -msse2 -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions
|
clang -o cgame.exe ../../build.c -Ofast -std=c11 -Wextra -Wno-incompatible-library-redeclaration -Wno-sign-compare -Wno-unused-parameter -Wno-builtin-requires-header -lgdi32 -luser32 -lwinmm -ld3d11 -ldxguid -ld3dcompiler -lshlwapi -finline-functions -ffast-math -fno-math-errno -funsafe-math-optimizations -freciprocal-math -ffinite-math-only -fassociative-math -fno-signed-zeros -fno-trapping-math -ftree-vectorize -fomit-frame-pointer -funroll-loops -fno-rtti -fno-exceptions -msse4.1
|
||||||
|
|
||||||
popd
|
popd
|
||||||
popd
|
popd
|
|
@ -27,6 +27,7 @@ typedef struct Cpu_Capabilities {
|
||||||
// Compiler specific stuff
|
// Compiler specific stuff
|
||||||
#if COMPILER_MVSC
|
#if COMPILER_MVSC
|
||||||
#define inline __forceinline
|
#define inline __forceinline
|
||||||
|
#define alignat(x) __declspec(align(x))
|
||||||
#define COMPILER_HAS_MEMCPY_INTRINSICS 1
|
#define COMPILER_HAS_MEMCPY_INTRINSICS 1
|
||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
#pragma intrinsic(__rdtsc)
|
#pragma intrinsic(__rdtsc)
|
||||||
|
@ -63,6 +64,7 @@ typedef struct Cpu_Capabilities {
|
||||||
#endif
|
#endif
|
||||||
#elif COMPILER_GCC || COMPILER_CLANG
|
#elif COMPILER_GCC || COMPILER_CLANG
|
||||||
#define inline __attribute__((always_inline)) inline
|
#define inline __attribute__((always_inline)) inline
|
||||||
|
#define alignat(x) __attribute__((aligned(x)))
|
||||||
#define COMPILER_HAS_MEMCPY_INTRINSICS 1
|
#define COMPILER_HAS_MEMCPY_INTRINSICS 1
|
||||||
inline u64 rdtsc() {
|
inline u64 rdtsc() {
|
||||||
unsigned int lo, hi;
|
unsigned int lo, hi;
|
||||||
|
|
|
@ -13,13 +13,13 @@
|
||||||
#define to_radians32 to_radians
|
#define to_radians32 to_radians
|
||||||
#define to_degrees32 to_degrees
|
#define to_degrees32 to_degrees
|
||||||
|
|
||||||
typedef union Vector2 {
|
typedef alignat(16) union Vector2 {
|
||||||
struct {float32 x, y;};
|
struct {float32 x, y;};
|
||||||
} Vector2;
|
} Vector2;
|
||||||
inline Vector2 v2(float32 x, float32 y) { return (Vector2){x, y}; }
|
inline Vector2 v2(float32 x, float32 y) { return (Vector2){x, y}; }
|
||||||
#define v2_expand(v) (v).x, (v).y
|
#define v2_expand(v) (v).x, (v).y
|
||||||
|
|
||||||
typedef union Vector3 {
|
typedef alignat(16) union Vector3 {
|
||||||
struct {float32 x, y, z;};
|
struct {float32 x, y, z;};
|
||||||
struct {float32 r, g, b;};
|
struct {float32 r, g, b;};
|
||||||
struct {Vector2 xy;};
|
struct {Vector2 xy;};
|
||||||
|
@ -28,7 +28,7 @@ typedef union Vector3 {
|
||||||
inline Vector3 v3(float32 x, float32 y, float32 z) { return (Vector3){x, y, z}; }
|
inline Vector3 v3(float32 x, float32 y, float32 z) { return (Vector3){x, y, z}; }
|
||||||
#define v3_expand(v) (v).x, (v).y, (v).z
|
#define v3_expand(v) (v).x, (v).y, (v).z
|
||||||
|
|
||||||
typedef union Vector4 {
|
typedef alignat(16) union Vector4 {
|
||||||
struct {float32 x, y, z, w;};
|
struct {float32 x, y, z, w;};
|
||||||
struct {float32 x1, y1, x2, y2;};
|
struct {float32 x1, y1, x2, y2;};
|
||||||
struct {float32 r, g, b, a;};
|
struct {float32 r, g, b, a;};
|
||||||
|
@ -66,19 +66,19 @@ inline Vector2 v2_divf(Vector2 a, float32 s) {
|
||||||
inline Vector3 v3_add(Vector3 a, Vector3 b) {
|
inline Vector3 v3_add(Vector3 a, Vector3 b) {
|
||||||
Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
|
Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
|
||||||
Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
|
Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
|
||||||
simd_add_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128);
|
simd_add_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128);
|
||||||
return a128.xyz;
|
return a128.xyz;
|
||||||
}
|
}
|
||||||
inline Vector3 v3_sub(Vector3 a, Vector3 b) {
|
inline Vector3 v3_sub(Vector3 a, Vector3 b) {
|
||||||
Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
|
Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
|
||||||
Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
|
Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
|
||||||
simd_sub_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128);
|
simd_sub_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128);
|
||||||
return a128.xyz;
|
return a128.xyz;
|
||||||
}
|
}
|
||||||
inline Vector3 v3_mul(Vector3 a, Vector3 b) {
|
inline Vector3 v3_mul(Vector3 a, Vector3 b) {
|
||||||
Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
|
Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
|
||||||
Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
|
Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
|
||||||
simd_mul_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128);
|
simd_mul_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128);
|
||||||
return a128.xyz;
|
return a128.xyz;
|
||||||
}
|
}
|
||||||
inline Vector3 v3_mulf(Vector3 a, float32 s) {
|
inline Vector3 v3_mulf(Vector3 a, float32 s) {
|
||||||
|
@ -87,7 +87,7 @@ inline Vector3 v3_mulf(Vector3 a, float32 s) {
|
||||||
inline Vector3 v3_div(Vector3 a, Vector3 b) {
|
inline Vector3 v3_div(Vector3 a, Vector3 b) {
|
||||||
Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
|
Vector4 a128 = v4(a.x, a.y, a.z, 0.0);
|
||||||
Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
|
Vector4 b128 = v4(b.x, b.y, b.z, 0.0);
|
||||||
simd_div_float32_128((f32*)&a128, (f32*)&b128, (f32*)&a128);
|
simd_div_float32_128_aligned((f32*)&a128, (f32*)&b128, (f32*)&a128);
|
||||||
return a128.xyz;
|
return a128.xyz;
|
||||||
}
|
}
|
||||||
inline Vector3 v3_divf(Vector3 a, float32 s) {
|
inline Vector3 v3_divf(Vector3 a, float32 s) {
|
||||||
|
@ -95,28 +95,29 @@ inline Vector3 v3_divf(Vector3 a, float32 s) {
|
||||||
}
|
}
|
||||||
|
|
||||||
inline Vector4 v4_add(Vector4 a, Vector4 b) {
|
inline Vector4 v4_add(Vector4 a, Vector4 b) {
|
||||||
simd_add_float32_128((f32*)&a, (f32*)&b, (f32*)&a);
|
simd_add_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a);
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
inline Vector4 v4_sub(Vector4 a, Vector4 b) {
|
inline Vector4 v4_sub(Vector4 a, Vector4 b) {
|
||||||
simd_sub_float32_128((f32*)&a, (f32*)&b, (f32*)&a);
|
simd_sub_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a);
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
inline Vector4 v4_mul(Vector4 a, Vector4 b) {
|
inline Vector4 v4_mul(Vector4 a, Vector4 b) {
|
||||||
simd_mul_float32_128((f32*)&a, (f32*)&b, (f32*)&a);
|
simd_mul_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a);
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
inline Vector4 v4_mulf(Vector4 a, float32 s) {
|
inline Vector4 v4_mulf(Vector4 a, float32 s) {
|
||||||
return v4_mul(a, v4(s, s, s, s));
|
return v4_mul(a, v4(s, s, s, s));
|
||||||
}
|
}
|
||||||
inline Vector4 v4_div(Vector4 a, Vector4 b) {
|
inline Vector4 v4_div(Vector4 a, Vector4 b) {
|
||||||
simd_div_float32_128((f32*)&a, (f32*)&b, (f32*)&a);
|
simd_div_float32_128_aligned((f32*)&a, (f32*)&b, (f32*)&a);
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
inline Vector4 v4_divf(Vector4 a, float32 s) {
|
inline Vector4 v4_divf(Vector4 a, float32 s) {
|
||||||
return v4_div(a, v4(s, s, s, s));
|
return v4_div(a, v4(s, s, s, s));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline Vector2 v2_normalize(Vector2 a) {
|
inline Vector2 v2_normalize(Vector2 a) {
|
||||||
float32 length = sqrt(a.x * a.x + a.y * a.y);
|
float32 length = sqrt(a.x * a.x + a.y * a.y);
|
||||||
if (length == 0) {
|
if (length == 0) {
|
||||||
|
@ -125,6 +126,15 @@ inline Vector2 v2_normalize(Vector2 a) {
|
||||||
return v2_divf(a, length);
|
return v2_divf(a, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline float v2_dot_product(Vector2 a, Vector2 b) {
|
||||||
|
return simd_dot_product_float32_64((float*)&a, (float*)&b);
|
||||||
|
}
|
||||||
|
inline float v3_dot_product(Vector3 a, Vector3 b) {
|
||||||
|
return simd_dot_product_float32_96((float*)&a, (float*)&b);
|
||||||
|
}
|
||||||
|
inline float v4_dot_product(Vector4 a, Vector4 b) {
|
||||||
|
return simd_dot_product_float32_128((float*)&a, (float*)&b);
|
||||||
|
}
|
||||||
|
|
||||||
Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) {
|
Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) {
|
||||||
float32 s = sin(rotation_radians);
|
float32 s = sin(rotation_radians);
|
||||||
|
|
228
oogabooga/simd.c
228
oogabooga/simd.c
|
@ -27,6 +27,10 @@ inline void basic_mul_int32_128(s32 *a, s32 *b, s32* result);
|
||||||
inline void basic_mul_int32_256(s32 *a, s32 *b, s32* result);
|
inline void basic_mul_int32_256(s32 *a, s32 *b, s32* result);
|
||||||
inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result);
|
inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result);
|
||||||
|
|
||||||
|
inline float basic_dot_product_float32_64(float *a, float *b);
|
||||||
|
inline float basic_dot_product_float32_96(float *a, float *b);
|
||||||
|
inline float basic_dot_product_float32_128(float *a, float *b);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#if ENABLE_SIMD
|
#if ENABLE_SIMD
|
||||||
|
@ -74,6 +78,12 @@ inline void simd_add_float32_128(float *a, float *b, float* result) {
|
||||||
__m128 vr = _mm_add_ps(va, vb);
|
__m128 vr = _mm_add_ps(va, vb);
|
||||||
_mm_storeu_ps(result, vr);
|
_mm_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_add_float32_128_aligned(float *a, float *b, float* result) {
|
||||||
|
__m128 va = _mm_load_ps(a);
|
||||||
|
__m128 vb = _mm_load_ps(b);
|
||||||
|
__m128 vr = _mm_add_ps(va, vb);
|
||||||
|
_mm_store_ps(result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
inline void simd_sub_float32_128(float *a, float *b, float* result) {
|
inline void simd_sub_float32_128(float *a, float *b, float* result) {
|
||||||
__m128 va = _mm_loadu_ps(a);
|
__m128 va = _mm_loadu_ps(a);
|
||||||
|
@ -81,6 +91,12 @@ inline void simd_sub_float32_128(float *a, float *b, float* result) {
|
||||||
__m128 vr = _mm_sub_ps(va, vb);
|
__m128 vr = _mm_sub_ps(va, vb);
|
||||||
_mm_storeu_ps(result, vr);
|
_mm_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_sub_float32_128_aligned(float *a, float *b, float* result) {
|
||||||
|
__m128 va = _mm_load_ps(a);
|
||||||
|
__m128 vb = _mm_load_ps(b);
|
||||||
|
__m128 vr = _mm_sub_ps(va, vb);
|
||||||
|
_mm_store_ps(result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
inline void simd_mul_float32_128(float *a, float *b, float* result) {
|
inline void simd_mul_float32_128(float *a, float *b, float* result) {
|
||||||
__m128 va = _mm_loadu_ps(a);
|
__m128 va = _mm_loadu_ps(a);
|
||||||
|
@ -88,6 +104,12 @@ inline void simd_mul_float32_128(float *a, float *b, float* result) {
|
||||||
__m128 vr = _mm_mul_ps(va, vb);
|
__m128 vr = _mm_mul_ps(va, vb);
|
||||||
_mm_storeu_ps(result, vr);
|
_mm_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_mul_float32_128_aligned(float *a, float *b, float* result) {
|
||||||
|
__m128 va = _mm_load_ps(a);
|
||||||
|
__m128 vb = _mm_load_ps(b);
|
||||||
|
__m128 vr = _mm_mul_ps(va, vb);
|
||||||
|
_mm_store_ps(result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
inline void simd_div_float32_128(float *a, float *b, float* result) {
|
inline void simd_div_float32_128(float *a, float *b, float* result) {
|
||||||
__m128 va = _mm_loadu_ps(a);
|
__m128 va = _mm_loadu_ps(a);
|
||||||
|
@ -95,6 +117,13 @@ inline void simd_div_float32_128(float *a, float *b, float* result) {
|
||||||
__m128 vr = _mm_div_ps(va, vb);
|
__m128 vr = _mm_div_ps(va, vb);
|
||||||
_mm_storeu_ps(result, vr);
|
_mm_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_div_float32_128_aligned(float *a, float *b, float* result) {
|
||||||
|
__m128 va = _mm_load_ps(a);
|
||||||
|
__m128 vb = _mm_load_ps(b);
|
||||||
|
__m128 vr = _mm_div_ps(va, vb);
|
||||||
|
_mm_store_ps(result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#if SIMD_ENABLE_SSE2
|
#if SIMD_ENABLE_SSE2
|
||||||
// SSE2
|
// SSE2
|
||||||
|
@ -107,17 +136,32 @@ inline void simd_add_int32_128(s32 *a, s32 *b, s32* result) {
|
||||||
__m128i vr = _mm_add_epi32(va, vb);
|
__m128i vr = _mm_add_epi32(va, vb);
|
||||||
_mm_storeu_si128((__m128i*)result, vr);
|
_mm_storeu_si128((__m128i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_add_int32_128_aligned(s32 *a, s32 *b, s32* result) {
|
||||||
|
__m128i va = _mm_load_si128((__m128i*)a);
|
||||||
|
__m128i vb = _mm_load_si128((__m128i*)b);
|
||||||
|
__m128i vr = _mm_add_epi32(va, vb);
|
||||||
|
_mm_store_si128((__m128i*)result, vr);
|
||||||
|
}
|
||||||
inline void simd_sub_int32_128(s32 *a, s32 *b, s32* result) {
|
inline void simd_sub_int32_128(s32 *a, s32 *b, s32* result) {
|
||||||
__m128i va = _mm_loadu_si128((__m128i*)a);
|
__m128i va = _mm_loadu_si128((__m128i*)a);
|
||||||
__m128i vb = _mm_loadu_si128((__m128i*)b);
|
__m128i vb = _mm_loadu_si128((__m128i*)b);
|
||||||
__m128i vr = _mm_sub_epi32(va, vb);
|
__m128i vr = _mm_sub_epi32(va, vb);
|
||||||
_mm_storeu_si128((__m128i*)result, vr);
|
_mm_storeu_si128((__m128i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_sub_int32_128_aligned(s32 *a, s32 *b, s32* result) {
|
||||||
|
__m128i va = _mm_load_si128((__m128i*)a);
|
||||||
|
__m128i vb = _mm_load_si128((__m128i*)b);
|
||||||
|
__m128i vr = _mm_sub_epi32(va, vb);
|
||||||
|
_mm_store_si128((__m128i*)result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#define simd_add_int32_128 basic_add_int32_128
|
#define simd_add_int32_128 basic_add_int32_128
|
||||||
#define simd_sub_int32_128 basic_sub_int32_128
|
#define simd_sub_int32_128 basic_sub_int32_128
|
||||||
|
|
||||||
|
#define simd_add_int32_128_aligned basic_add_int32_128
|
||||||
|
#define simd_sub_int32_128_aligned basic_sub_int32_128
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if SIMD_ENABLE_SSE41
|
#if SIMD_ENABLE_SSE41
|
||||||
|
@ -127,8 +171,55 @@ inline void simd_mul_int32_128(s32 *a, s32 *b, s32* result) {
|
||||||
__m128i vr = _mm_mullo_epi32(va, vb);
|
__m128i vr = _mm_mullo_epi32(va, vb);
|
||||||
_mm_storeu_si128((__m128i*)result, vr);
|
_mm_storeu_si128((__m128i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_mul_int32_128_aligned(s32 *a, s32 *b, s32* result) {
|
||||||
|
__m128i va = _mm_load_si128((__m128i*)a);
|
||||||
|
__m128i vb = _mm_load_si128((__m128i*)b);
|
||||||
|
__m128i vr = _mm_mullo_epi32(va, vb);
|
||||||
|
_mm_store_si128((__m128i*)result, vr);
|
||||||
|
}
|
||||||
|
inline float simd_dot_product_float32_64(float *a, float *b) {
|
||||||
|
__m128 vec1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)a);
|
||||||
|
__m128 vec2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64*)b);
|
||||||
|
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x31);
|
||||||
|
return _mm_cvtss_f32(dot_product);
|
||||||
|
}
|
||||||
|
inline float simd_dot_product_float32_96(float *a, float *b) {
|
||||||
|
__m128 vec1 = _mm_loadu_ps(a);
|
||||||
|
__m128 vec2 = _mm_loadu_ps(b);
|
||||||
|
vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
|
||||||
|
vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
|
||||||
|
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
|
||||||
|
return _mm_cvtss_f32(dot_product);
|
||||||
|
}
|
||||||
|
inline float simd_dot_product_float32_96_aligned(float *a, float *b) {
|
||||||
|
__m128 vec1 = _mm_load_ps(a);
|
||||||
|
__m128 vec2 = _mm_load_ps(b);
|
||||||
|
vec1 = _mm_and_ps(vec1, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
|
||||||
|
vec2 = _mm_and_ps(vec2, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)));
|
||||||
|
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0x71);
|
||||||
|
return _mm_cvtss_f32(dot_product);
|
||||||
|
}
|
||||||
|
inline float simd_dot_product_float32_128(float *a, float *b) {
|
||||||
|
__m128 vec1 = _mm_loadu_ps(a);
|
||||||
|
__m128 vec2 = _mm_loadu_ps(b);
|
||||||
|
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0xF1);
|
||||||
|
return _mm_cvtss_f32(dot_product);
|
||||||
|
}
|
||||||
|
inline float simd_dot_product_float32_128_aligned(float *a, float *b) {
|
||||||
|
__m128 vec1 = _mm_load_ps(a);
|
||||||
|
__m128 vec2 = _mm_load_ps(b);
|
||||||
|
__m128 dot_product = _mm_dp_ps(vec1, vec2, 0xF1);
|
||||||
|
return _mm_cvtss_f32(dot_product);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
#define simd_mul_int32_128 basic_mul_int32_128
|
#define simd_mul_int32_128 basic_mul_int32_128
|
||||||
|
#define simd_mul_int32_128_aligned basic_mul_int32_128
|
||||||
|
#define simd_dot_product_float32_64 basic_dot_product_float32_64
|
||||||
|
#define simd_dot_product_float32_96 basic_dot_product_float32_96
|
||||||
|
#define simd_dot_product_float32_128 basic_dot_product_float32_128
|
||||||
|
#define simd_dot_product_float32_64_aligned basic_dot_product_float32_64
|
||||||
|
#define simd_dot_product_float32_96_aligned basic_dot_product_float32_96
|
||||||
|
#define simd_dot_product_float32_128_aligned basic_dot_product_float32_128
|
||||||
#endif // SIMD_ENABLE_SSE41
|
#endif // SIMD_ENABLE_SSE41
|
||||||
|
|
||||||
#if SIMD_ENABLE_AVX
|
#if SIMD_ENABLE_AVX
|
||||||
|
@ -142,29 +233,58 @@ inline void simd_add_float32_256(float32 *a, float32 *b, float32* result) {
|
||||||
__m256 vr = _mm256_add_ps(va, vb);
|
__m256 vr = _mm256_add_ps(va, vb);
|
||||||
_mm256_storeu_ps(result, vr);
|
_mm256_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_add_float32_256_aligned(float32 *a, float32 *b, float32* result) {
|
||||||
|
__m256 va = _mm256_load_ps(a);
|
||||||
|
__m256 vb = _mm256_load_ps(b);
|
||||||
|
__m256 vr = _mm256_add_ps(va, vb);
|
||||||
|
_mm256_store_ps(result, vr);
|
||||||
|
}
|
||||||
inline void simd_sub_float32_256(float32 *a, float32 *b, float32* result) {
|
inline void simd_sub_float32_256(float32 *a, float32 *b, float32* result) {
|
||||||
__m256 va = _mm256_loadu_ps(a);
|
__m256 va = _mm256_loadu_ps(a);
|
||||||
__m256 vb = _mm256_loadu_ps(b);
|
__m256 vb = _mm256_loadu_ps(b);
|
||||||
__m256 vr = _mm256_sub_ps(va, vb);
|
__m256 vr = _mm256_sub_ps(va, vb);
|
||||||
_mm256_storeu_ps(result, vr);
|
_mm256_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_sub_float32_256_aligned(float32 *a, float32 *b, float32* result) {
|
||||||
|
__m256 va = _mm256_load_ps(a);
|
||||||
|
__m256 vb = _mm256_load_ps(b);
|
||||||
|
__m256 vr = _mm256_sub_ps(va, vb);
|
||||||
|
_mm256_store_ps(result, vr);
|
||||||
|
}
|
||||||
inline void simd_mul_float32_256(float32 *a, float32 *b, float32* result) {
|
inline void simd_mul_float32_256(float32 *a, float32 *b, float32* result) {
|
||||||
__m256 va = _mm256_loadu_ps(a);
|
__m256 va = _mm256_loadu_ps(a);
|
||||||
__m256 vb = _mm256_loadu_ps(b);
|
__m256 vb = _mm256_loadu_ps(b);
|
||||||
__m256 vr = _mm256_mul_ps(va, vb);
|
__m256 vr = _mm256_mul_ps(va, vb);
|
||||||
_mm256_storeu_ps(result, vr);
|
_mm256_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_mul_float32_256_aligned(float32 *a, float32 *b, float32* result) {
|
||||||
|
__m256 va = _mm256_load_ps(a);
|
||||||
|
__m256 vb = _mm256_load_ps(b);
|
||||||
|
__m256 vr = _mm256_mul_ps(va, vb);
|
||||||
|
_mm256_store_ps(result, vr);
|
||||||
|
}
|
||||||
inline void simd_div_float32_256(float32 *a, float32 *b, float32* result){
|
inline void simd_div_float32_256(float32 *a, float32 *b, float32* result){
|
||||||
__m256 va = _mm256_loadu_ps(a);
|
__m256 va = _mm256_loadu_ps(a);
|
||||||
__m256 vb = _mm256_loadu_ps(b);
|
__m256 vb = _mm256_loadu_ps(b);
|
||||||
__m256 vr = _mm256_div_ps(va, vb);
|
__m256 vr = _mm256_div_ps(va, vb);
|
||||||
_mm256_storeu_ps(result, vr);
|
_mm256_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_div_float32_256_aligned(float32 *a, float32 *b, float32* result){
|
||||||
|
__m256 va = _mm256_load_ps(a);
|
||||||
|
__m256 vb = _mm256_load_ps(b);
|
||||||
|
__m256 vr = _mm256_div_ps(va, vb);
|
||||||
|
_mm256_store_ps(result, vr);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
#define simd_add_float32_256 basic_add_float32_256
|
#define simd_add_float32_256 basic_add_float32_256
|
||||||
#define simd_sub_float32_256 basic_sub_float32_256
|
#define simd_sub_float32_256 basic_sub_float32_256
|
||||||
#define simd_mul_float32_256 basic_mul_float32_256
|
#define simd_mul_float32_256 basic_mul_float32_256
|
||||||
#define simd_div_float32_256 basic_div_float32_256
|
#define simd_div_float32_256 basic_div_float32_256
|
||||||
|
|
||||||
|
#define simd_add_float32_256_aligned basic_add_float32_256
|
||||||
|
#define simd_sub_float32_256_aligned basic_sub_float32_256
|
||||||
|
#define simd_mul_float32_256_aligned basic_mul_float32_256
|
||||||
|
#define simd_div_float32_256_aligned basic_div_float32_256
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if SIMD_ENABLE_AVX2
|
#if SIMD_ENABLE_AVX2
|
||||||
|
@ -178,22 +298,44 @@ inline void simd_add_int32_256(s32 *a, s32 *b, s32* result) {
|
||||||
__m256i vr = _mm256_add_epi32(va, vb);
|
__m256i vr = _mm256_add_epi32(va, vb);
|
||||||
_mm256_storeu_si256((__m256i*)result, vr);
|
_mm256_storeu_si256((__m256i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_add_int32_256_aligned(s32 *a, s32 *b, s32* result) {
|
||||||
|
__m256i va = _mm256_load_si256((__m256i*)a);
|
||||||
|
__m256i vb = _mm256_load_si256((__m256i*)b);
|
||||||
|
__m256i vr = _mm256_add_epi32(va, vb);
|
||||||
|
_mm256_store_si256((__m256i*)result, vr);
|
||||||
|
}
|
||||||
inline void simd_sub_int32_256(s32 *a, s32 *b, s32* result) {
|
inline void simd_sub_int32_256(s32 *a, s32 *b, s32* result) {
|
||||||
__m256i va = _mm256_loadu_si256((__m256i*)a);
|
__m256i va = _mm256_loadu_si256((__m256i*)a);
|
||||||
__m256i vb = _mm256_loadu_si256((__m256i*)b);
|
__m256i vb = _mm256_loadu_si256((__m256i*)b);
|
||||||
__m256i vr = _mm256_sub_epi32(va, vb);
|
__m256i vr = _mm256_sub_epi32(va, vb);
|
||||||
_mm256_storeu_si256((__m256i*)result, vr);
|
_mm256_storeu_si256((__m256i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_sub_int32_256_aligned(s32 *a, s32 *b, s32* result) {
|
||||||
|
__m256i va = _mm256_load_si256((__m256i*)a);
|
||||||
|
__m256i vb = _mm256_load_si256((__m256i*)b);
|
||||||
|
__m256i vr = _mm256_sub_epi32(va, vb);
|
||||||
|
_mm256_store_si256((__m256i*)result, vr);
|
||||||
|
}
|
||||||
inline void simd_mul_int32_256(s32 *a, s32 *b, s32* result) {
|
inline void simd_mul_int32_256(s32 *a, s32 *b, s32* result) {
|
||||||
__m256i va = _mm256_loadu_si256((__m256i*)a);
|
__m256i va = _mm256_loadu_si256((__m256i*)a);
|
||||||
__m256i vb = _mm256_loadu_si256((__m256i*)b);
|
__m256i vb = _mm256_loadu_si256((__m256i*)b);
|
||||||
__m256i vr = _mm256_mullo_epi32(va, vb);
|
__m256i vr = _mm256_mullo_epi32(va, vb);
|
||||||
_mm256_storeu_si256((__m256i*)result, vr);
|
_mm256_storeu_si256((__m256i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_mul_int32_256_aligned(s32 *a, s32 *b, s32* result) {
|
||||||
|
__m256i va = _mm256_load_si256((__m256i*)a);
|
||||||
|
__m256i vb = _mm256_load_si256((__m256i*)b);
|
||||||
|
__m256i vr = _mm256_mullo_epi32(va, vb);
|
||||||
|
_mm256_store_si256((__m256i*)result, vr);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
#define simd_add_int32_256 basic_add_int32_256
|
#define simd_add_int32_256 basic_add_int32_256
|
||||||
#define simd_sub_int32_256 basic_sub_int32_256
|
#define simd_sub_int32_256 basic_sub_int32_256
|
||||||
#define simd_mul_int32_256 basic_mul_int32_256
|
#define simd_mul_int32_256 basic_mul_int32_256
|
||||||
|
|
||||||
|
#define simd_add_int32_256_aligned basic_add_int32_256
|
||||||
|
#define simd_sub_int32_256_aligned basic_sub_int32_256
|
||||||
|
#define simd_mul_int32_256_aligned basic_mul_int32_256
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if SIMD_ENABLE_AVX512
|
#if SIMD_ENABLE_AVX512
|
||||||
|
@ -207,6 +349,12 @@ inline void simd_add_float32_512(float *a, float *b, float* result) {
|
||||||
__m512 vr = _mm512_add_ps(va, vb);
|
__m512 vr = _mm512_add_ps(va, vb);
|
||||||
_mm512_storeu_ps(result, vr);
|
_mm512_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_add_float32_512_aligned(float *a, float *b, float* result) {
|
||||||
|
__m512 va = _mm512_load_ps(a);
|
||||||
|
__m512 vb = _mm512_load_ps(b);
|
||||||
|
__m512 vr = _mm512_add_ps(va, vb);
|
||||||
|
_mm512_store_ps(result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
inline void simd_sub_float32_512(float *a, float *b, float* result) {
|
inline void simd_sub_float32_512(float *a, float *b, float* result) {
|
||||||
__m512 va = _mm512_loadu_ps(a);
|
__m512 va = _mm512_loadu_ps(a);
|
||||||
|
@ -214,6 +362,12 @@ inline void simd_sub_float32_512(float *a, float *b, float* result) {
|
||||||
__m512 vr = _mm512_sub_ps(va, vb);
|
__m512 vr = _mm512_sub_ps(va, vb);
|
||||||
_mm512_storeu_ps(result, vr);
|
_mm512_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_sub_float32_512_aligned(float *a, float *b, float* result) {
|
||||||
|
__m512 va = _mm512_load_ps(a);
|
||||||
|
__m512 vb = _mm512_load_ps(b);
|
||||||
|
__m512 vr = _mm512_sub_ps(va, vb);
|
||||||
|
_mm512_store_ps(result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
inline void simd_mul_float32_512(float *a, float *b, float* result) {
|
inline void simd_mul_float32_512(float *a, float *b, float* result) {
|
||||||
__m512 va = _mm512_loadu_ps(a);
|
__m512 va = _mm512_loadu_ps(a);
|
||||||
|
@ -221,6 +375,12 @@ inline void simd_mul_float32_512(float *a, float *b, float* result) {
|
||||||
__m512 vr = _mm512_mul_ps(va, vb);
|
__m512 vr = _mm512_mul_ps(va, vb);
|
||||||
_mm512_storeu_ps(result, vr);
|
_mm512_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_mul_float32_512_aligned(float *a, float *b, float* result) {
|
||||||
|
__m512 va = _mm512_load_ps(a);
|
||||||
|
__m512 vb = _mm512_load_ps(b);
|
||||||
|
__m512 vr = _mm512_mul_ps(va, vb);
|
||||||
|
_mm512_store_ps(result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
inline void simd_div_float32_512(float *a, float *b, float* result) {
|
inline void simd_div_float32_512(float *a, float *b, float* result) {
|
||||||
__m512 va = _mm512_loadu_ps(a);
|
__m512 va = _mm512_loadu_ps(a);
|
||||||
|
@ -228,12 +388,24 @@ inline void simd_div_float32_512(float *a, float *b, float* result) {
|
||||||
__m512 vr = _mm512_div_ps(va, vb);
|
__m512 vr = _mm512_div_ps(va, vb);
|
||||||
_mm512_storeu_ps(result, vr);
|
_mm512_storeu_ps(result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_div_float32_512_aligned(float *a, float *b, float* result) {
|
||||||
|
__m512 va = _mm512_load_ps(a);
|
||||||
|
__m512 vb = _mm512_load_ps(b);
|
||||||
|
__m512 vr = _mm512_div_ps(va, vb);
|
||||||
|
_mm512_store_ps(result, vr);
|
||||||
|
}
|
||||||
inline void simd_add_int32_512(int32 *a, int32 *b, int32* result) {
|
inline void simd_add_int32_512(int32 *a, int32 *b, int32* result) {
|
||||||
__m512i va = _mm512_loadu_si512((__m512i*)a);
|
__m512i va = _mm512_loadu_si512((__m512i*)a);
|
||||||
__m512i vb = _mm512_loadu_si512((__m512i*)b);
|
__m512i vb = _mm512_loadu_si512((__m512i*)b);
|
||||||
__m512i vr = _mm512_add_epi32(va, vb);
|
__m512i vr = _mm512_add_epi32(va, vb);
|
||||||
_mm512_storeu_si512((__m512i*)result, vr);
|
_mm512_storeu_si512((__m512i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_add_int32_512_aligned(int32 *a, int32 *b, int32* result) {
|
||||||
|
__m512i va = _mm512_load_si512((__m512i*)a);
|
||||||
|
__m512i vb = _mm512_load_si512((__m512i*)b);
|
||||||
|
__m512i vr = _mm512_add_epi32(va, vb);
|
||||||
|
_mm512_store_si512((__m512i*)result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
inline void simd_sub_int32_512(int32 *a, int32 *b, int32* result) {
|
inline void simd_sub_int32_512(int32 *a, int32 *b, int32* result) {
|
||||||
__m512i va = _mm512_loadu_si512((__m512i*)a);
|
__m512i va = _mm512_loadu_si512((__m512i*)a);
|
||||||
|
@ -241,6 +413,12 @@ inline void simd_sub_int32_512(int32 *a, int32 *b, int32* result) {
|
||||||
__m512i vr = _mm512_sub_epi32(va, vb);
|
__m512i vr = _mm512_sub_epi32(va, vb);
|
||||||
_mm512_storeu_si512((__m512i*)result, vr);
|
_mm512_storeu_si512((__m512i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_sub_int32_512_aligned(int32 *a, int32 *b, int32* result) {
|
||||||
|
__m512i va = _mm512_load_si512((__m512i*)a);
|
||||||
|
__m512i vb = _mm512_load_si512((__m512i*)b);
|
||||||
|
__m512i vr = _mm512_sub_epi32(va, vb);
|
||||||
|
_mm512_store_si512((__m512i*)result, vr);
|
||||||
|
}
|
||||||
|
|
||||||
inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
|
inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
|
||||||
__m512i va = _mm512_loadu_si512((__m512i*)a);
|
__m512i va = _mm512_loadu_si512((__m512i*)a);
|
||||||
|
@ -248,6 +426,12 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
|
||||||
__m512i vr = _mm512_mullo_epi32(va, vb);
|
__m512i vr = _mm512_mullo_epi32(va, vb);
|
||||||
_mm512_storeu_si512((__m512i*)result, vr);
|
_mm512_storeu_si512((__m512i*)result, vr);
|
||||||
}
|
}
|
||||||
|
inline void simd_mul_int32_512_aligned(int32 *a, int32 *b, int32* result) {
|
||||||
|
__m512i va = _mm512_load_si512((__m512i*)a);
|
||||||
|
__m512i vb = _mm512_load_si512((__m512i*)b);
|
||||||
|
__m512i vr = _mm512_mullo_epi32(va, vb);
|
||||||
|
_mm512_store_si512((__m512i*)result, vr);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
#define simd_add_float32_512 basic_add_float32_512
|
#define simd_add_float32_512 basic_add_float32_512
|
||||||
#define simd_sub_float32_512 basic_sub_float32_512
|
#define simd_sub_float32_512 basic_sub_float32_512
|
||||||
|
@ -256,6 +440,14 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
|
||||||
#define simd_add_int32_512 basic_add_int32_512
|
#define simd_add_int32_512 basic_add_int32_512
|
||||||
#define simd_sub_int32_512 basic_sub_int32_512
|
#define simd_sub_int32_512 basic_sub_int32_512
|
||||||
#define simd_mul_int32_512 basic_mul_int32_512
|
#define simd_mul_int32_512 basic_mul_int32_512
|
||||||
|
|
||||||
|
#define simd_add_float32_512_aligned basic_add_float32_512
|
||||||
|
#define simd_sub_float32_512_aligned basic_sub_float32_512
|
||||||
|
#define simd_mul_float32_512_aligned basic_mul_float32_512
|
||||||
|
#define simd_div_float32_512_aligned basic_div_float32_512
|
||||||
|
#define simd_add_int32_512_aligned basic_add_int32_512
|
||||||
|
#define simd_sub_int32_512_aligned basic_sub_int32_512
|
||||||
|
#define simd_mul_int32_512_aligned basic_mul_int32_512
|
||||||
#endif // SIMD_ENABLE_AVX512
|
#endif // SIMD_ENABLE_AVX512
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
@ -269,22 +461,41 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
|
||||||
#define simd_mul_float32_128 basic_mul_float32_128
|
#define simd_mul_float32_128 basic_mul_float32_128
|
||||||
#define simd_div_float32_64 basic_div_float32_64
|
#define simd_div_float32_64 basic_div_float32_64
|
||||||
#define simd_div_float32_128 basic_div_float32_128
|
#define simd_div_float32_128 basic_div_float32_128
|
||||||
|
#define simd_add_float32_128_aligned basic_add_float32_128
|
||||||
|
#define simd_sub_float32_128_aligned basic_sub_float32_128
|
||||||
|
#define simd_mul_float32_128_aligned basic_mul_float32_128
|
||||||
|
#define simd_div_float32_128_aligned basic_div_float32_128
|
||||||
|
|
||||||
// SSE2
|
// SSE2
|
||||||
#define simd_add_int32_128 basic_add_int32_128
|
#define simd_add_int32_128 basic_add_int32_128
|
||||||
#define simd_sub_int32_128 basic_sub_int32_128
|
#define simd_sub_int32_128 basic_sub_int32_128
|
||||||
#define simd_mul_int32_128 basic_mul_int32_128
|
#define simd_mul_int32_128 basic_mul_int32_128
|
||||||
|
#define simd_add_int32_128_aligned basic_add_int32_128
|
||||||
|
#define simd_sub_int32_128_aligned basic_sub_int32_128
|
||||||
|
#define simd_mul_int32_128_aligned basic_mul_int32_128
|
||||||
|
|
||||||
|
// SSE41
|
||||||
|
#define simd_dot_product_float32_64 basic_dot_product_float32_64
|
||||||
|
#define simd_dot_product_float32_96 basic_dot_product_float32_96
|
||||||
|
#define simd_dot_product_float32_128 basic_dot_product_float32_128
|
||||||
|
|
||||||
// AVX
|
// AVX
|
||||||
#define simd_add_float32_256 basic_add_float32_256
|
#define simd_add_float32_256 basic_add_float32_256
|
||||||
#define simd_sub_float32_256 basic_sub_float32_256
|
#define simd_sub_float32_256 basic_sub_float32_256
|
||||||
#define simd_mul_float32_256 basic_mul_float32_256
|
#define simd_mul_float32_256 basic_mul_float32_256
|
||||||
#define simd_div_float32_256 basic_div_float32_256
|
#define simd_div_float32_256 basic_div_float32_256
|
||||||
|
#define simd_add_float32_256_aligned basic_add_float32_256
|
||||||
|
#define simd_sub_float32_256_aligned basic_sub_float32_256
|
||||||
|
#define simd_mul_float32_256_aligned basic_mul_float32_256
|
||||||
|
#define simd_div_float32_256_aligned basic_div_float32_256
|
||||||
|
|
||||||
// AVX2
|
// AVX2
|
||||||
#define simd_add_int32_256 basic_add_int32_256
|
#define simd_add_int32_256 basic_add_int32_256
|
||||||
#define simd_sub_int32_256 basic_sub_int32_256
|
#define simd_sub_int32_256 basic_sub_int32_256
|
||||||
#define simd_mul_int32_256 basic_mul_int32_256
|
#define simd_mul_int32_256 basic_mul_int32_256
|
||||||
|
#define simd_add_int32_256_aligned basic_add_int32_256
|
||||||
|
#define simd_sub_int32_256_aligned basic_sub_int32_256
|
||||||
|
#define simd_mul_int32_256_aligned basic_mul_int32_256
|
||||||
|
|
||||||
// AVX512
|
// AVX512
|
||||||
#define simd_add_float32_512 basic_add_float32_512
|
#define simd_add_float32_512 basic_add_float32_512
|
||||||
|
@ -294,6 +505,13 @@ inline void simd_mul_int32_512(int32 *a, int32 *b, int32* result) {
|
||||||
#define simd_add_int32_512 basic_add_int32_512
|
#define simd_add_int32_512 basic_add_int32_512
|
||||||
#define simd_sub_int32_512 basic_sub_int32_512
|
#define simd_sub_int32_512 basic_sub_int32_512
|
||||||
#define simd_mul_int32_512 basic_mul_int32_512
|
#define simd_mul_int32_512 basic_mul_int32_512
|
||||||
|
#define simd_add_float32_512_aligned basic_add_float32_512
|
||||||
|
#define simd_sub_float32_512_aligned basic_sub_float32_512
|
||||||
|
#define simd_mul_float32_512_aligned basic_mul_float32_512
|
||||||
|
#define simd_div_float32_512_aligned basic_div_float32_512
|
||||||
|
#define simd_add_int32_512_aligned basic_add_int32_512
|
||||||
|
#define simd_sub_int32_512_aligned basic_sub_int32_512
|
||||||
|
#define simd_mul_int32_512_aligned basic_mul_int32_512
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -411,7 +629,15 @@ inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result) {
|
||||||
simd_mul_int32_256(a, b, result);
|
simd_mul_int32_256(a, b, result);
|
||||||
simd_mul_int32_256(a+8, b+8, result+8);
|
simd_mul_int32_256(a+8, b+8, result+8);
|
||||||
}
|
}
|
||||||
|
inline float basic_dot_product_float32_64(float *a, float *b) {
|
||||||
|
return a[0] * b[0] + a[1] * b[1];
|
||||||
|
}
|
||||||
|
inline float basic_dot_product_float32_96(float *a, float *b) {
|
||||||
|
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
|
||||||
|
}
|
||||||
|
inline float basic_dot_product_float32_128(float *a, float *b) {
|
||||||
|
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -636,17 +636,17 @@ void test_simd() {
|
||||||
simd_add_float32_64(a_f32, b_f32, result_f32);
|
simd_add_float32_64(a_f32, b_f32, result_f32);
|
||||||
assert(floats_roughly_match(result_f32[0], a_f32[0]+b_f32[0]), "SIMD add float32 64 failed");
|
assert(floats_roughly_match(result_f32[0], a_f32[0]+b_f32[0]), "SIMD add float32 64 failed");
|
||||||
|
|
||||||
simd_add_float32_128(a_f32, b_f32, result_f32);
|
simd_add_float32_128_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 4; ++i) {
|
for (int i = 0; i < 4; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 128 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 128 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_add_float32_256(a_f32, b_f32, result_f32);
|
simd_add_float32_256_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < 8; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 256 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 256 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_add_float32_512(a_f32, b_f32, result_f32);
|
simd_add_float32_512_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 512 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] + b_f32[i]), "SIMD add float32 512 failed");
|
||||||
}
|
}
|
||||||
|
@ -655,17 +655,17 @@ void test_simd() {
|
||||||
simd_sub_float32_64(a_f32, b_f32, result_f32);
|
simd_sub_float32_64(a_f32, b_f32, result_f32);
|
||||||
assert(floats_roughly_match(result_f32[0], a_f32[0] - b_f32[0]), "SIMD sub float32 64 failed");
|
assert(floats_roughly_match(result_f32[0], a_f32[0] - b_f32[0]), "SIMD sub float32 64 failed");
|
||||||
|
|
||||||
simd_sub_float32_128(a_f32, b_f32, result_f32);
|
simd_sub_float32_128_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 4; ++i) {
|
for (int i = 0; i < 4; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 128 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 128 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_sub_float32_256(a_f32, b_f32, result_f32);
|
simd_sub_float32_256_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < 8; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 256 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 256 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_sub_float32_512(a_f32, b_f32, result_f32);
|
simd_sub_float32_512_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 512 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] - b_f32[i]), "SIMD sub float32 512 failed");
|
||||||
}
|
}
|
||||||
|
@ -674,17 +674,17 @@ void test_simd() {
|
||||||
simd_mul_float32_64(a_f32, b_f32, result_f32);
|
simd_mul_float32_64(a_f32, b_f32, result_f32);
|
||||||
assert(floats_roughly_match(result_f32[0], a_f32[0]*b_f32[0]), "SIMD mul float32 64 failed");
|
assert(floats_roughly_match(result_f32[0], a_f32[0]*b_f32[0]), "SIMD mul float32 64 failed");
|
||||||
|
|
||||||
simd_mul_float32_128(a_f32, b_f32, result_f32);
|
simd_mul_float32_128_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 4; ++i) {
|
for (int i = 0; i < 4; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 128 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 128 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_mul_float32_256(a_f32, b_f32, result_f32);
|
simd_mul_float32_256_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < 8; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 256 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 256 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_mul_float32_512(a_f32, b_f32, result_f32);
|
simd_mul_float32_512_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 512 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] * b_f32[i]), "SIMD mul float32 512 failed");
|
||||||
}
|
}
|
||||||
|
@ -693,65 +693,65 @@ void test_simd() {
|
||||||
simd_div_float32_64(a_f32, b_f32, result_f32);
|
simd_div_float32_64(a_f32, b_f32, result_f32);
|
||||||
assert(floats_roughly_match(result_f32[0], a_f32[0]/b_f32[0]), "SIMD div float32 64 failed");
|
assert(floats_roughly_match(result_f32[0], a_f32[0]/b_f32[0]), "SIMD div float32 64 failed");
|
||||||
|
|
||||||
simd_div_float32_128(a_f32, b_f32, result_f32);
|
simd_div_float32_128_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 4; ++i) {
|
for (int i = 0; i < 4; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 128 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 128 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_div_float32_256(a_f32, b_f32, result_f32);
|
simd_div_float32_256_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < 8; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 256 failed, %.5f, %.5f, %.5f", result_f32[i], a_f32[i], a_f32[i] / b_f32[i]);
|
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 256 failed, %.5f, %.5f, %.5f", result_f32[i], a_f32[i], a_f32[i] / b_f32[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_div_float32_512(a_f32, b_f32, result_f32);
|
simd_div_float32_512_aligned(a_f32, b_f32, result_f32);
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 512 failed");
|
assert(floats_roughly_match(result_f32[i], a_f32[i] / b_f32[i]), "SIMD div float32 512 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test int32 add
|
// Test int32 add
|
||||||
simd_add_int32_128(a_i32, b_i32, result_i32);
|
simd_add_int32_128_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 4; ++i) {
|
for (int i = 0; i < 4; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 128 failed");
|
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 128 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_add_int32_256(a_i32, b_i32, result_i32);
|
simd_add_int32_256_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < 8; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 256 failed");
|
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 256 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_add_int32_512(a_i32, b_i32, result_i32);
|
simd_add_int32_512_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 512 failed");
|
assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 512 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test int32 subtract
|
// Test int32 subtract
|
||||||
simd_sub_int32_128(a_i32, b_i32, result_i32);
|
simd_sub_int32_128_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 4; ++i) {
|
for (int i = 0; i < 4; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 128 failed");
|
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 128 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_sub_int32_256(a_i32, b_i32, result_i32);
|
simd_sub_int32_256_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < 8; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 256 failed");
|
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 256 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_sub_int32_512(a_i32, b_i32, result_i32);
|
simd_sub_int32_512_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 512 failed");
|
assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 512 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test int32 multiply
|
// Test int32 multiply
|
||||||
simd_mul_int32_128(a_i32, b_i32, result_i32);
|
simd_mul_int32_128_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 4; ++i) {
|
for (int i = 0; i < 4; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 128 failed");
|
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 128 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_mul_int32_256(a_i32, b_i32, result_i32);
|
simd_mul_int32_256_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < 8; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 256 failed");
|
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 256 failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_mul_int32_512(a_i32, b_i32, result_i32);
|
simd_mul_int32_512_aligned(a_i32, b_i32, result_i32);
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 512 failed");
|
assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 512 failed");
|
||||||
}
|
}
|
||||||
|
@ -759,15 +759,17 @@ void test_simd() {
|
||||||
#define _TEST_NUM_SAMPLES ((100000 + 64) & ~(63))
|
#define _TEST_NUM_SAMPLES ((100000 + 64) & ~(63))
|
||||||
assert(_TEST_NUM_SAMPLES % 16 == 0);
|
assert(_TEST_NUM_SAMPLES % 16 == 0);
|
||||||
|
|
||||||
float *samples_a = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float));
|
float *samples_a = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)+512);
|
||||||
float *samples_b = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float));
|
float *samples_b = alloc(get_heap_allocator(), _TEST_NUM_SAMPLES*sizeof(float)+512);
|
||||||
|
samples_a = (float*)(((u64)samples_a+64)&~(63));
|
||||||
|
samples_b = (float*)(((u64)samples_b+64)&~(63));
|
||||||
memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float));
|
memset(samples_a, 2, _TEST_NUM_SAMPLES*sizeof(float));
|
||||||
memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float));
|
memset(samples_b, 2, _TEST_NUM_SAMPLES*sizeof(float));
|
||||||
|
|
||||||
u64 start = os_get_current_cycle_count();
|
u64 start = os_get_current_cycle_count();
|
||||||
|
|
||||||
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 16) {
|
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 16) {
|
||||||
simd_mul_float32_512(&samples_a[i], &samples_b[i], &samples_a[i]);
|
simd_mul_float32_512_aligned(&samples_a[i], &samples_b[i], &samples_a[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
u64 end = os_get_current_cycle_count();
|
u64 end = os_get_current_cycle_count();
|
||||||
|
@ -780,7 +782,7 @@ void test_simd() {
|
||||||
start = os_get_current_cycle_count();
|
start = os_get_current_cycle_count();
|
||||||
|
|
||||||
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 8) {
|
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 8) {
|
||||||
simd_mul_float32_256(&samples_a[i], &samples_b[i], &samples_a[i]);
|
simd_mul_float32_256_aligned(&samples_a[i], &samples_b[i], &samples_a[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
end = os_get_current_cycle_count();
|
end = os_get_current_cycle_count();
|
||||||
|
@ -793,7 +795,7 @@ void test_simd() {
|
||||||
start = os_get_current_cycle_count();
|
start = os_get_current_cycle_count();
|
||||||
|
|
||||||
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 4) {
|
for (u64 i = 0; i < _TEST_NUM_SAMPLES; i += 4) {
|
||||||
simd_mul_float32_128(&samples_a[i], &samples_b[i], &samples_a[i]);
|
simd_mul_float32_128_aligned(&samples_a[i], &samples_b[i], &samples_a[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
end = os_get_current_cycle_count();
|
end = os_get_current_cycle_count();
|
||||||
|
@ -826,7 +828,8 @@ void test_simd() {
|
||||||
cycles = end-start;
|
cycles = end-start;
|
||||||
print("NO SIMD float32 mul took %llu cycles\n", cycles);
|
print("NO SIMD float32 mul took %llu cycles\n", cycles);
|
||||||
}
|
}
|
||||||
void test_math_library() {
|
// Indirect testing of some simd stuff
|
||||||
|
void test_linmath() {
|
||||||
|
|
||||||
// Test vector creation and access
|
// Test vector creation and access
|
||||||
Vector2 v2_test = v2(1.0f, 2.0f);
|
Vector2 v2_test = v2(1.0f, 2.0f);
|
||||||
|
@ -1005,6 +1008,15 @@ void test_math_library() {
|
||||||
Vector4 mixed_v4 = v4(4.0f, 8.0f, 12.0f, 16.0f);
|
Vector4 mixed_v4 = v4(4.0f, 8.0f, 12.0f, 16.0f);
|
||||||
Vector4 mixed_v4_result = v4_mulf(mixed_v4, 0.25f);
|
Vector4 mixed_v4_result = v4_mulf(mixed_v4, 0.25f);
|
||||||
assert(mixed_v4_result.x == 1.0f && mixed_v4_result.y == 2.0f && mixed_v4_result.z == 3.0f && mixed_v4_result.w == 4.0f, "Mixed Vector4 scalar multiplication failed");
|
assert(mixed_v4_result.x == 1.0f && mixed_v4_result.y == 2.0f && mixed_v4_result.z == 3.0f && mixed_v4_result.w == 4.0f, "Mixed Vector4 scalar multiplication failed");
|
||||||
|
|
||||||
|
|
||||||
|
float v2_dot = v2_dot_product(v2(2, 7), v2(3, 2));
|
||||||
|
float v3_dot = v3_dot_product(v3(2, 7, 2), v3(3, 2, 9));
|
||||||
|
float v4_dot = v4_dot_product(v4(2, 7, 6, 1), v4(3, 2, 1, 4));
|
||||||
|
|
||||||
|
assert(floats_roughly_match(v2_dot, 20), "Failed: v2_dot_product");
|
||||||
|
assert(floats_roughly_match(v3_dot, 38), "Failed: v3_dot_product");
|
||||||
|
assert(floats_roughly_match(v4_dot, 30), "Failed: v4_dot_product");
|
||||||
}
|
}
|
||||||
void oogabooga_run_tests() {
|
void oogabooga_run_tests() {
|
||||||
|
|
||||||
|
@ -1022,23 +1034,23 @@ void oogabooga_run_tests() {
|
||||||
print("OK!\n");
|
print("OK!\n");
|
||||||
|
|
||||||
|
|
||||||
print("Thread bombing allocator... ");
|
//print("Thread bombing allocator... ");
|
||||||
Thread* threads[100];
|
//Thread* threads[100];
|
||||||
for (int i = 0; i < 100; i++) {
|
//for (int i = 0; i < 100; i++) {
|
||||||
threads[i] = os_make_thread(test_allocator_threaded, get_heap_allocator());
|
// threads[i] = os_make_thread(test_allocator_threaded, get_heap_allocator());
|
||||||
os_start_thread(threads[i]);
|
// os_start_thread(threads[i]);
|
||||||
}
|
//}
|
||||||
for (int i = 0; i < 100; i++) {
|
//for (int i = 0; i < 100; i++) {
|
||||||
os_join_thread(threads[i]);
|
// os_join_thread(threads[i]);
|
||||||
}
|
//}
|
||||||
print("OK!\n");
|
//print("OK!\n");
|
||||||
|
|
||||||
print("Testing file IO... ");
|
print("Testing file IO... ");
|
||||||
test_file_io();
|
test_file_io();
|
||||||
print("OK!\n");
|
print("OK!\n");
|
||||||
|
|
||||||
print("Testing file IO... ");
|
print("Testing linmath... ");
|
||||||
test_math_library();
|
test_linmath();
|
||||||
print("OK!\n");
|
print("OK!\n");
|
||||||
|
|
||||||
print("Testing simd... ");
|
print("Testing simd... ");
|
||||||
|
|
Reference in a new issue