diff --git a/oogabooga/base.c b/oogabooga/base.c index b9bc88a..24cb9d8 100644 --- a/oogabooga/base.c +++ b/oogabooga/base.c @@ -60,86 +60,27 @@ void printf(const char* fmt, ...); #define ZERO(t) (t){0} -/// -// Compiler specific stuff -// We make inline actually inline. -#ifdef _MSC_VER - // Microsoft Visual C++ - #define inline __forceinline + +#ifdef __clang__ + // Clang/LLVM + #define inline __attribute__((always_inline)) inline #define COMPILER_HAS_MEMCPY_INTRINSICS 1 - #include - #pragma intrinsic(__rdtsc) - inline u64 rdtsc() { - return __rdtsc(); - } + #define COMPILER_CLANG 1 #elif defined(__GNUC__) || defined(__GNUG__) // GNU GCC/G++ #define inline __attribute__((always_inline)) inline #define COMPILER_HAS_MEMCPY_INTRINSICS 1 - inline u64 rdtsc() { - unsigned int lo, hi; - __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); - return ((u64)hi << 32) | lo; - } -#elif defined(__clang__) - // Clang/LLVM - #define inline __attribute__((always_inline)) inline - #define COMPILER_HAS_MEMCPY_INTRINSICS 1 - inline u64 rdtsc() { - unsigned int lo, hi; - __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); - return ((u64)hi << 32) | lo; - } -#elif defined(__INTEL_COMPILER) || defined(__ICC) - // Intel C++ Compiler + #define COMPILER_GCC 1 +#elif defined(_MSC_VER) + // Microsoft Visual C++ #define inline __forceinline #define COMPILER_HAS_MEMCPY_INTRINSICS 1 - inline u64 rdtsc() { - return __rdtsc(); - } -#elif defined(__BORLANDC__) - // Borland C++ - #define inline __inline - inline u64 rdtsc() { - unsigned int lo, hi; - __asm { - rdtsc - mov lo, eax - mov hi, edx - } - return ((u64)hi << 32) | lo; - } -#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) - // Oracle Solaris Studio - #define inline inline __attribute__((always_inline)) - inline u64 rdtsc() { - unsigned int lo, hi; - asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); - return ((u64)hi << 32) | lo; - } -#elif defined(__IBMC__) || defined(__IBMCPP__) - // IBM XL C/C++ Compiler - #define inline 
__attribute__((always_inline)) inline
-	#define COMPILER_HAS_MEMCPY_INTRINSICS 1
-	inline u64 rdtsc() {
-		unsigned int lo, hi;
-		__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
-		return ((u64)hi << 32) | lo;
-	}
-#elif defined(__PGI)
-	// Portland Group Compiler
-	#define inline inline __attribute__((always_inline))
-	inline u64 rdtsc() {
-		unsigned int lo, hi;
-		asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
-		return ((u64)hi << 32) | lo;
-	}
+	#define COMPILER_MSVC 1
 #else
-	// Fallback for unknown compilers
 	#define inline inline
+	#define COMPILER_HAS_MEMCPY_INTRINSICS 0
 #endif
-
 #define FIRST_ARG(arg1, ...) arg1
 #define SECOND_ARG(arg1, arg2, ...) arg2
 #define print(...) _Generic((FIRST_ARG(__VA_ARGS__)), \
diff --git a/oogabooga/cpu.c b/oogabooga/cpu.c
new file mode 100644
index 0000000..7e70070
--- /dev/null
+++ b/oogabooga/cpu.c
@@ -0,0 +1,1599 @@
+// #Portability rip ARM
+typedef struct Cpu_Info_X86 {
+	u32 eax;
+	u32 ebx;
+	u32 ecx;
+	u32 edx;
+} Cpu_Info_X86;
+
+typedef struct Cpu_Capabilities {
+	bool sse1;
+	bool sse2;
+	bool sse3;
+	bool ssse3;
+	bool sse41;
+	bool sse42;
+	bool avx;
+	bool avx2;
+	bool avx512;
+
+} Cpu_Capabilities;
+
+///
+// Compiler specific stuff
+// FIX(review): was "#if COMPILER_MVSC" (typo). base.c defines COMPILER_MSVC,
+// so under MSVC this whole branch never matched and the build silently fell
+// through to the #else stubs where rdtsc() and cpuid() return zeros.
+#if COMPILER_MSVC
+	#include <intrin.h> // FIX(review): header name was lost in the paste; __rdtsc/__cpuid live here
+	#pragma intrinsic(__rdtsc)
+	inline u64 rdtsc() {
+		return __rdtsc();
+	}
+	inline Cpu_Info_X86 cpuid(u32 function_id) {
+		Cpu_Info_X86 i;
+		__cpuid((int*)&i, function_id);
+		return i;
+	}
+#elif COMPILER_GCC
+	inline u64 rdtsc() {
+		unsigned int lo, hi;
+		__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+		return ((u64)hi << 32) | lo;
+	}
+	// FIX(review): this signature line was missing, leaving a bare function
+	// body at file scope — a hard syntax error on every GCC build.
+	inline Cpu_Info_X86 cpuid(u32 function_id) {
+		Cpu_Info_X86 info;
+		__asm__ __volatile__(
+			"cpuid"
+			: "=a"(info.eax), "=b"(info.ebx), "=c"(info.ecx), "=d"(info.edx)
+			: "a"(function_id), "c"(0));
+		return info;
+	}
+#elif COMPILER_CLANG
+	#include <immintrin.h> // NOTE(review): original header name lost in the paste; nothing below requires it — verify against upstream
+	inline u64 rdtsc() {
+		unsigned int lo, hi;
+		__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+		return ((u64)hi << 32) | lo;
+	}
+	inline Cpu_Info_X86 cpuid(u32 function_id) {
+		Cpu_Info_X86 info;
+		__asm__
__volatile__( + "cpuid" + : "=a"(info.eax), "=b"(info.ebx), "=c"(info.ecx), "=d"(info.edx) + : "a"(function_id), "c"(0)); + return info; + } +#else + inline u64 rdtsc() { return 0; } + inline Cpu_Info_X86 cpuid(u32 function_id) {return (Cpu_Info_X86){0};} +#endif + + +Cpu_Capabilities query_cpu_capabilities() { + Cpu_Capabilities result = {0}; + + Cpu_Info_X86 info = cpuid(1); + + result.sse1 = (info.edx & (1 << 25)) != 0; + result.sse2 = (info.edx & (1 << 26)) != 0; + result.sse3 = (info.ecx & (1 << 0)) != 0; + result.ssse3 = (info.ecx & (1 << 9)) != 0; + result.sse41 = (info.ecx & (1 << 19)) != 0; + result.sse42 = (info.ecx & (1 << 20)) != 0; + + result.avx = (info.ecx & (1 << 28)) != 0; + + Cpu_Info_X86 ext_info = cpuid(7); + result.avx2 = (ext_info.ebx & (1 << 5)) != 0; + + result.avx512 = (ext_info.ebx & (1 << 16)) != 0; + + return result; +} + +void (*simd_add_float32_64 )(float32 *a, float32 *b, float32* result) = 0; +void (*simd_add_float32_128)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_add_float32_256)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_add_float32_512)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_sub_float32_64 )(float32 *a, float32 *b, float32* result) = 0; +void (*simd_sub_float32_128)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_sub_float32_256)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_sub_float32_512)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_mul_float32_64 )(float32 *a, float32 *b, float32* result) = 0; +void (*simd_mul_float32_128)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_mul_float32_256)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_mul_float32_512)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_div_float32_64 )(float32 *a, float32 *b, float32* result) = 0; +void (*simd_div_float32_128)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_div_float32_256)(float32 *a, float32 *b, float32* result) = 
0; +void (*simd_div_float32_512)(float32 *a, float32 *b, float32* result) = 0; +void (*simd_add_int32_128)(s32 *a, s32 *b, s32* result) = 0; +void (*simd_add_int32_256)(s32 *a, s32 *b, s32* result) = 0; +void (*simd_add_int32_512)(s32 *a, s32 *b, s32* result) = 0; +void (*simd_sub_int32_128)(s32 *a, s32 *b, s32* result) = 0; +void (*simd_sub_int32_256)(s32 *a, s32 *b, s32* result) = 0; +void (*simd_sub_int32_512)(s32 *a, s32 *b, s32* result) = 0; +void (*simd_mul_int32_128)(s32 *a, s32 *b, s32* result) = 0; +void (*simd_mul_int32_256)(s32 *a, s32 *b, s32* result) = 0; +void (*simd_mul_int32_512)(s32 *a, s32 *b, s32* result) = 0; + +inline void basic_add_float32_64(float32 *a, float32 *b, float32* result) { + result[0] = a[0] + b[0]; + result[1] = a[1] + b[1]; +} + +inline void basic_add_float32_128(float32 *a, float32 *b, float32* result) { + result[0] = a[0] + b[0]; + result[1] = a[1] + b[1]; + result[2] = a[2] + b[2]; + result[3] = a[3] + b[3]; +} + +inline void basic_add_float32_256(float32 *a, float32 *b, float32* result) { + simd_add_float32_128(a, b, result); + simd_add_float32_128(a+4, b+4, result+4); +} + +inline void basic_add_float32_512(float32 *a, float32 *b, float32* result) { + simd_add_float32_256(a, b, result); + simd_add_float32_256(a+8, b+8, result+8); +} + +inline void basic_sub_float32_64(float32 *a, float32 *b, float32* result) { + result[0] = a[0] - b[0]; + result[1] = a[1] - b[1]; +} + +inline void basic_sub_float32_128(float32 *a, float32 *b, float32* result) { + result[0] = a[0] - b[0]; + result[1] = a[1] - b[1]; + result[2] = a[2] - b[2]; + result[3] = a[3] - b[3]; +} + +inline void basic_sub_float32_256(float32 *a, float32 *b, float32* result) { + simd_sub_float32_128(a, b, result); + simd_sub_float32_128(a+4, b+4, result+4); +} + +inline void basic_sub_float32_512(float32 *a, float32 *b, float32* result) { + simd_sub_float32_256(a, b, result); + simd_sub_float32_256(a+8, b+8, result+8); +} + +inline void basic_mul_float32_64(float32 *a, 
float32 *b, float32* result) { + result[0] = a[0] * b[0]; + result[1] = a[1] * b[1]; +} + +inline void basic_mul_float32_128(float32 *a, float32 *b, float32* result) { + result[0] = a[0] * b[0]; + result[1] = a[1] * b[1]; + result[2] = a[2] * b[2]; + result[3] = a[3] * b[3]; +} + +inline void basic_mul_float32_256(float32 *a, float32 *b, float32* result) { + simd_mul_float32_128(a, b, result); + simd_mul_float32_128(a+4, b+4, result+4); +} + +inline void basic_mul_float32_512(float32 *a, float32 *b, float32* result) { + simd_mul_float32_256(a, b, result); + simd_mul_float32_256(a+8, b+8, result+8); +} + +inline void basic_div_float32_64(float32 *a, float32 *b, float32* result) { + result[0] = a[0] / b[0]; + result[1] = a[1] / b[1]; +} + +inline void basic_div_float32_128(float32 *a, float32 *b, float32* result) { + result[0] = a[0] / b[0]; + result[1] = a[1] / b[1]; + result[2] = a[2] / b[2]; + result[3] = a[3] / b[3]; +} + +inline void basic_div_float32_256(float32 *a, float32 *b, float32* result) { + simd_div_float32_128(a, b, result); + simd_div_float32_128(a+4, b+4, result+4); +} + +inline void basic_div_float32_512(float32 *a, float32 *b, float32* result) { + simd_div_float32_256(a, b, result); + simd_div_float32_256(a+8, b+8, result+8); +} + +inline void basic_add_int32_128(s32 *a, s32 *b, s32* result) { + result[0] = a[0] + b[0]; + result[1] = a[1] + b[1]; + result[2] = a[2] + b[2]; + result[3] = a[3] + b[3]; +} + +inline void basic_add_int32_256(s32 *a, s32 *b, s32* result) { + simd_add_int32_128(a, b, result); + simd_add_int32_128(a+4, b+4, result+4); +} + +inline void basic_add_int32_512(s32 *a, s32 *b, s32* result) { + simd_add_int32_256(a, b, result); + simd_add_int32_256(a+8, b+8, result+8); +} + +inline void basic_sub_int32_128(s32 *a, s32 *b, s32* result) { + result[0] = a[0] - b[0]; + result[1] = a[1] - b[1]; + result[2] = a[2] - b[2]; + result[3] = a[3] - b[3]; +} + +inline void basic_sub_int32_256(s32 *a, s32 *b, s32* result) { + 
simd_sub_int32_128(a, b, result); + simd_sub_int32_128(a+4, b+4, result+4); +} + +inline void basic_sub_int32_512(s32 *a, s32 *b, s32* result) { + simd_sub_int32_256(a, b, result); + simd_sub_int32_256(a+8, b+8, result+8); +} + +inline void basic_mul_int32_128(s32 *a, s32 *b, s32* result) { + result[0] = a[0] * b[0]; + result[1] = a[1] * b[1]; + result[2] = a[2] * b[2]; + result[3] = a[3] * b[3]; +} + +inline void basic_mul_int32_256(s32 *a, s32 *b, s32* result) { + simd_mul_int32_128(a, b, result); + simd_mul_int32_128(a+4, b+4, result+4); +} + +inline void basic_mul_int32_512(s32 *a, s32 *b, s32* result) { + simd_mul_int32_256(a, b, result); + simd_mul_int32_256(a+8, b+8, result+8); +} + +#if ENABLE_SIMD + +#if COMPILER_MSVC + // SSE 1 float32 + inline void sse1_add_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse1_add_float32_128(float32 *a, float32 *b, float32* result); + inline void sse1_sub_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse1_sub_float32_128(float32 *a, float32 *b, float32* result); + inline void sse1_mul_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse1_mul_float32_128(float32 *a, float32 *b, float32* result); + inline void sse1_div_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse1_div_float32_128(float32 *a, float32 *b, float32* result); + + // SSE 2 float32 + inline void sse2_add_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse2_add_float32_128(float32 *a, float32 *b, float32* result); + inline void sse2_sub_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse2_sub_float32_128(float32 *a, float32 *b, float32* result); + inline void sse2_mul_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse2_mul_float32_128(float32 *a, float32 *b, float32* result); + inline void sse2_div_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse2_div_float32_128(float32 *a, float32 *b, float32* 
result); + + // SSE 2 int32 + inline void sse2_add_int32_128(s32 *a, s32 *b, s32* result); + inline void sse2_sub_int32_128(s32 *a, s32 *b, s32* result); + inline void sse2_mul_int32_128(s32 *a, s32 *b, s32* result); + + // SSE 3 float32 + inline void sse3_add_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse3_add_float32_128(float32 *a, float32 *b, float32* result); + inline void sse3_sub_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse3_sub_float32_128(float32 *a, float32 *b, float32* result); + inline void sse3_mul_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse3_mul_float32_128(float32 *a, float32 *b, float32* result); + inline void sse3_div_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse3_div_float32_128(float32 *a, float32 *b, float32* result); + + // SSSE 3 float32 + inline void ssse3_add_float32_64 (float32 *a, float32 *b, float32* result); + inline void ssse3_add_float32_128(float32 *a, float32 *b, float32* result); + inline void ssse3_sub_float32_64 (float32 *a, float32 *b, float32* result); + inline void ssse3_sub_float32_128(float32 *a, float32 *b, float32* result); + inline void ssse3_mul_float32_64 (float32 *a, float32 *b, float32* result); + inline void ssse3_mul_float32_128(float32 *a, float32 *b, float32* result); + inline void ssse3_div_float32_64 (float32 *a, float32 *b, float32* result); + inline void ssse3_div_float32_128(float32 *a, float32 *b, float32* result); + + // SSSE 4.1 float32 + inline void sse41_add_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse41_add_float32_128(float32 *a, float32 *b, float32* result); + inline void sse41_sub_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse41_sub_float32_128(float32 *a, float32 *b, float32* result); + inline void sse41_mul_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse41_mul_float32_128(float32 *a, float32 *b, float32* result); + 
inline void sse41_div_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse41_div_float32_128(float32 *a, float32 *b, float32* result); + + // SSSE 4.2 float32 + inline void sse42_add_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse42_add_float32_128(float32 *a, float32 *b, float32* result); + inline void sse42_sub_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse42_sub_float32_128(float32 *a, float32 *b, float32* result); + inline void sse42_mul_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse42_mul_float32_128(float32 *a, float32 *b, float32* result); + inline void sse42_div_float32_64 (float32 *a, float32 *b, float32* result); + inline void sse42_div_float32_128(float32 *a, float32 *b, float32* result); + + // AVX 1/2 float32 + inline void avx_add_float32_256(float32 *a, float32 *b, float32* result); + inline void avx_sub_float32_256(float32 *a, float32 *b, float32* result); + inline void avx_mul_float32_256(float32 *a, float32 *b, float32* result); + inline void avx_div_float32_256(float32 *a, float32 *b, float32* result); + + // AVX2 2 int32 + inline void avx2_add_int32_256(s32 *a, s32 *b, s32* result); + inline void avx2_sub_int32_256(s32 *a, s32 *b, s32* result); + inline void avx2_mul_int32_256(s32 *a, s32 *b, s32* result); + + // AVX2 512 float32 + inline void avx512_add_float32_512(float32 *a, float32 *b, float32* result); + inline void avx512_sub_float32_512(float32 *a, float32 *b, float32* result); + inline void avx512_mul_float32_512(float32 *a, float32 *b, float32* result); + inline void avx512_div_float32_512(float32 *a, float32 *b, float32* result); + + // AVX2 512 float32 + inline void avx512_add_int32_512(s32 *a, s32 *b, s32* result); + inline void avx512_sub_int32_512(s32 *a, s32 *b, s32* result); + inline void avx512_mul_int32_512(s32 *a, s32 *b, s32* result); + +#elif COMPILER_GCC || COMPILER_CLANG + + // SSE 1 float32 +inline void sse1_add_float32_64(float32 
*a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "addss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse1_add_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse1_sub_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "subss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse1_sub_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse1_mul_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "mulss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse1_mul_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, 
(%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse1_div_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "divss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse1_div_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +// SSE 2 float32 +inline void sse2_add_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "addss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse2_add_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse2_sub_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "subss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse2_sub_float32_128(float32 *a, float32 *b, 
float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse2_mul_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "mulss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse2_mul_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse2_div_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "divss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse2_div_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +// SSE 2 int32 +inline void sse2_add_int32_128(s32 *a, s32 *b, s32* result) { + __asm__ ( + "movdqa (%0), %%xmm0\n\t" + "movdqa (%1), %%xmm1\n\t" + "paddd %%xmm1, %%xmm0\n\t" + "movdqa %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse2_sub_int32_128(s32 *a, s32 *b, s32* result) { + __asm__ ( + "movdqa (%0), 
%%xmm0\n\t"
+		"movdqa (%1), %%xmm1\n\t"
+		"psubd %%xmm1, %%xmm0\n\t"
+		"movdqa %%xmm0, (%2)\n\t"
+		:
+		: "r" (a), "r" (b), "r" (result)
+		: "xmm0", "xmm1", "memory"
+	);
+}
+
+// FIX(review): the original used pmulld, which is an SSE4.1 instruction and
+// #UD-faults on SSE2-only CPUs — exactly the machines this dispatch tier is
+// selected for. Emulate the lane-wise 32-bit multiply with SSE2
+// pmuludq + shuffles; the low 32 bits of each product are identical for
+// signed and unsigned operands, so this is exact for s32.
+inline void sse2_mul_int32_128(s32 *a, s32 *b, s32* result) {
+	__asm__ (
+		"movdqa (%0), %%xmm0\n\t"
+		"movdqa (%1), %%xmm1\n\t"
+		"movdqa %%xmm0, %%xmm2\n\t"
+		"pmuludq %%xmm1, %%xmm0\n\t"       /* a0*b0, a2*b2 (64-bit each) */
+		"psrlq $32, %%xmm2\n\t"            /* a1, a3 into even lanes */
+		"psrlq $32, %%xmm1\n\t"            /* b1, b3 into even lanes */
+		"pmuludq %%xmm1, %%xmm2\n\t"       /* a1*b1, a3*b3 */
+		"pshufd $0x08, %%xmm0, %%xmm0\n\t" /* pack low dwords of even products */
+		"pshufd $0x08, %%xmm2, %%xmm2\n\t" /* pack low dwords of odd products */
+		"punpckldq %%xmm2, %%xmm0\n\t"     /* restore original lane order */
+		"movdqa %%xmm0, (%2)\n\t"
+		:
+		: "r" (a), "r" (b), "r" (result)
+		: "xmm0", "xmm1", "xmm2", "memory"
+	);
+}
+
+// SSE 3 float32
+// FIX(review): the *_float32_64 variants below only processed lane 0, even
+// though "_64" means two packed floats — basic_add_float32_64 writes both
+// result[0] and result[1]. They now load/store a full 64 bits so both lanes
+// are computed, and the dead a128/b128/r128 scratch copies are dropped.
+// A "memory" clobber is also added throughout this group: the asm bodies
+// read and write through pointers without any memory operand, so the
+// compiler was free to reorder or cache loads/stores around them.
+inline void sse3_add_float32_64(float32 *a, float32 *b, float32* result) {
+	__asm__ (
+		"movsd (%0), %%xmm0\n\t"  /* load a[0..1] into low 64 bits, upper zeroed */
+		"movsd (%1), %%xmm1\n\t"
+		"addps %%xmm1, %%xmm0\n\t"
+		"movlps %%xmm0, (%2)\n\t" /* store result[0..1] */
+		:
+		: "r" (a), "r" (b), "r" (result)
+		: "xmm0", "xmm1", "memory"
+	);
+}
+
+inline void sse3_add_float32_128(float32 *a, float32 *b, float32* result) {
+	// NOTE(review): movaps assumes 16-byte-aligned pointers and faults
+	// otherwise — TODO confirm all callers guarantee alignment.
+	__asm__ (
+		"movaps (%0), %%xmm0\n\t"
+		"movaps (%1), %%xmm1\n\t"
+		"addps %%xmm1, %%xmm0\n\t"
+		"movaps %%xmm0, (%2)\n\t"
+		:
+		: "r" (a), "r" (b), "r" (result)
+		: "xmm0", "xmm1", "memory"
+	);
+}
+
+inline void sse3_sub_float32_64(float32 *a, float32 *b, float32* result) {
+	__asm__ (
+		"movsd (%0), %%xmm0\n\t"
+		"movsd (%1), %%xmm1\n\t"
+		"subps %%xmm1, %%xmm0\n\t"
+		"movlps %%xmm0, (%2)\n\t"
+		:
+		: "r" (a), "r" (b), "r" (result)
+		: "xmm0", "xmm1", "memory"
+	);
+}
+
+inline void sse3_sub_float32_128(float32 *a, float32 *b, float32* result) {
+	__asm__ (
+		"movaps (%0), %%xmm0\n\t"
+		"movaps (%1), %%xmm1\n\t"
+		"subps %%xmm1, %%xmm0\n\t"
+		"movaps %%xmm0, (%2)\n\t"
+		:
+		: "r" (a), "r" (b), "r" (result)
+		: "xmm0", "xmm1", "memory"
+	);
+}
+
+inline void sse3_mul_float32_64(float32 *a, float32 *b, float32* result) {
+	float a128[2];
+	a128[0] = *a;
+	a128[1] = 0;
+	float
b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "mulss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse3_mul_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse3_div_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "divss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse3_div_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +// SSSE 3 float32 +inline void ssse3_add_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "addss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void ssse3_add_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : 
"xmm0", "xmm1" + ); +} + +inline void ssse3_sub_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "subss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void ssse3_sub_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void ssse3_mul_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "mulss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void ssse3_mul_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void ssse3_div_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "divss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void ssse3_div_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), 
%%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +// SSE4.1 float32 +inline void sse41_add_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "addss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse41_add_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse41_sub_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "subss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse41_sub_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse41_mul_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "mulss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) 
+ : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse41_mul_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse41_div_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "divss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse41_div_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +// SSE4.2 float32 +inline void sse42_add_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "addss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse42_add_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "addps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse42_sub_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ 
( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "subss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse42_sub_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "subps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); + +} + +inline void sse42_mul_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "mulss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse42_mul_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "mulps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +inline void sse42_div_float32_64(float32 *a, float32 *b, float32* result) { + float a128[2]; + a128[0] = *a; + a128[1] = 0; + float b128[2]; + b128[0] = *b; + b128[1] = 0; + float r128[2]; + r128[0] = *result; + r128[1] = 0; + __asm__ ( + "movss (%0), %%xmm0\n\t" + "movss (%1), %%xmm1\n\t" + "divss %%xmm1, %%xmm0\n\t" + "movss %%xmm0, (%2)\n\t" + : + : "r" (a128), "r" (b128), "r" (r128) + : "xmm0", "xmm1" + ); + *result = r128[0]; +} + +inline void sse42_div_float32_128(float32 *a, float32 *b, float32* result) { + __asm__ ( + "movaps (%0), %%xmm0\n\t" + "movaps (%1), %%xmm1\n\t" + "divps %%xmm1, %%xmm0\n\t" + "movaps %%xmm0, (%2)\n\t" + : + : "r" (a), "r" (b), "r" (result) + : "xmm0", "xmm1" + ); +} + +// AVX float32 +inline void avx_add_float32_256(float32 *a, float32 *b, float32* result) { + 
__asm__ ( + "vmovups %1, %%ymm0\n\t" + "vmovups %2, %%ymm1\n\t" + "vaddps %%ymm1, %%ymm0, %%ymm0\n\t" + "vmovups %%ymm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "ymm0", "ymm1" + ); +} + +inline void avx_sub_float32_256(float32 *a, float32 *b, float32* result) { + __asm__ ( + "vmovups %1, %%ymm0\n\t" + "vmovups %2, %%ymm1\n\t" + "vsubps %%ymm1, %%ymm0, %%ymm0\n\t" + "vmovups %%ymm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "ymm0", "ymm1" + ); +} + +inline void avx_mul_float32_256(float32 *a, float32 *b, float32* result) { + __asm__ ( + "vmovups %1, %%ymm0\n\t" + "vmovups %2, %%ymm1\n\t" + "vmulps %%ymm1, %%ymm0, %%ymm0\n\t" + "vmovups %%ymm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "ymm0", "ymm1" + ); +} + +inline void avx_div_float32_256(float32 *a, float32 *b, float32* result) { + __asm__ ( + "vmovups %1, %%ymm0\n\t" + "vmovups %2, %%ymm1\n\t" + "vdivps %%ymm1, %%ymm0, %%ymm0\n\t" + "vmovups %%ymm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "ymm0", "ymm1" + ); +} + +// AVX2 int32 +inline void avx2_add_int32_256(s32 *a, s32 *b, s32* result) { + __asm__ ( + "vmovdqu %1, %%ymm0\n\t" + "vmovdqu %2, %%ymm1\n\t" + "vpaddd %%ymm1, %%ymm0, %%ymm0\n\t" + "vmovdqu %%ymm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "ymm0", "ymm1" + ); +} + +inline void avx2_sub_int32_256(s32 *a, s32 *b, s32* result) { + __asm__ ( + "vmovdqu %1, %%ymm0\n\t" + "vmovdqu %2, %%ymm1\n\t" + "vpsubd %%ymm1, %%ymm0, %%ymm0\n\t" + "vmovdqu %%ymm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "ymm0", "ymm1" + ); +} + +inline void avx2_mul_int32_256(s32 *a, s32 *b, s32* result) { + __asm__ ( + "vmovdqu %1, %%ymm0\n\t" + "vmovdqu %2, %%ymm1\n\t" + "vpmulld %%ymm1, %%ymm0, %%ymm0\n\t" + "vmovdqu %%ymm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "ymm0", "ymm1" + ); +} + +// AVX-512 float32 +inline void avx512_add_float32_512(float32 *a, float32 *b, float32* result) { + __asm__ ( + "vmovups %1, %%zmm0\n\t" + "vmovups %2, 
%%zmm1\n\t" + "vaddps %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovups %%zmm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "zmm0", "zmm1" + ); +} + +inline void avx512_sub_float32_512(float32 *a, float32 *b, float32* result) { + __asm__ ( + "vmovups %1, %%zmm0\n\t" + "vmovups %2, %%zmm1\n\t" + "vsubps %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovups %%zmm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "zmm0", "zmm1" + ); +} + +inline void avx512_mul_float32_512(float32 *a, float32 *b, float32* result) { + __asm__ ( + "vmovups %1, %%zmm0\n\t" + "vmovups %2, %%zmm1\n\t" + "vmulps %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovups %%zmm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "zmm0", "zmm1" + ); +} + +inline void avx512_div_float32_512(float32 *a, float32 *b, float32* result) { + __asm__ ( + "vmovups %1, %%zmm0\n\t" + "vmovups %2, %%zmm1\n\t" + "vdivps %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovups %%zmm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "zmm0", "zmm1" + ); +} + +// AVX-512 int32 +inline void avx512_add_int32_512(s32 *a, s32 *b, s32* result) { + __asm__ ( + "vmovdqu32 %1, %%zmm0\n\t" + "vmovdqu32 %2, %%zmm1\n\t" + "vpaddd %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqu32 %%zmm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "zmm0", "zmm1" + ); +} + +inline void avx512_sub_int32_512(s32 *a, s32 *b, s32* result) { + __asm__ ( + "vmovdqu32 %1, %%zmm0\n\t" + "vmovdqu32 %2, %%zmm1\n\t" + "vpsubd %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqu32 %%zmm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "zmm0", "zmm1" + ); +} + +inline void avx512_mul_int32_512(s32 *a, s32 *b, s32* result) { + __asm__ ( + "vmovdqu32 %1, %%zmm0\n\t" + "vmovdqu32 %2, %%zmm1\n\t" + "vpmulld %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqu32 %%zmm0, %0\n\t" + : "=m" (*result) + : "m" (*a), "m" (*b) + : "zmm0", "zmm1" + ); +} + +#else + // SSE 1 float32 + #define sse1_add_float32_64 basic_add_float32_64 + #define sse1_add_float32_128 basic_add_float32_128 + #define sse1_sub_float32_64 
basic_sub_float32_64
+	#define sse1_sub_float32_128 basic_sub_float32_128
+	#define sse1_mul_float32_64 basic_mul_float32_64
+	#define sse1_mul_float32_128 basic_mul_float32_128
+	#define sse1_div_float32_64 basic_div_float32_64
+	#define sse1_div_float32_128 basic_div_float32_128
+	#define sse2_add_float32_64 basic_add_float32_64
+	#define sse2_add_float32_128 basic_add_float32_128
+	#define sse2_sub_float32_64 basic_sub_float32_64
+	#define sse2_sub_float32_128 basic_sub_float32_128
+	#define sse2_mul_float32_64 basic_mul_float32_64
+	#define sse2_mul_float32_128 basic_mul_float32_128
+	#define sse2_div_float32_64 basic_div_float32_64
+	#define sse2_div_float32_128 basic_div_float32_128
+	#define sse2_add_int32_128 basic_add_int32_128
+	#define sse2_sub_int32_128 basic_sub_int32_128
+	#define sse2_mul_int32_128 basic_mul_int32_128
+	#define sse3_add_float32_64 basic_add_float32_64
+	#define sse3_add_float32_128 basic_add_float32_128
+	#define sse3_sub_float32_64 basic_sub_float32_64
+	#define sse3_sub_float32_128 basic_sub_float32_128
+	#define sse3_mul_float32_64 basic_mul_float32_64
+	#define sse3_mul_float32_128 basic_mul_float32_128
+	#define sse3_div_float32_64 basic_div_float32_64
+	#define sse3_div_float32_128 basic_div_float32_128
+	#define ssse3_add_float32_64 basic_add_float32_64
+	#define ssse3_add_float32_128 basic_add_float32_128
+	#define ssse3_sub_float32_64 basic_sub_float32_64
+	#define ssse3_sub_float32_128 basic_sub_float32_128
+	#define ssse3_mul_float32_64 basic_mul_float32_64
+	#define ssse3_mul_float32_128 basic_mul_float32_128
+	#define ssse3_div_float32_64 basic_div_float32_64
+	#define ssse3_div_float32_128 basic_div_float32_128
+	#define sse41_add_float32_64 basic_add_float32_64
+	#define sse41_add_float32_128 basic_add_float32_128
+	#define sse41_sub_float32_64 basic_sub_float32_64
+	#define sse41_sub_float32_128 basic_sub_float32_128
+	#define sse41_mul_float32_64 basic_mul_float32_64
+	#define sse41_mul_float32_128
basic_mul_float32_128 + #define sse41_div_float32_64 basic_div_float32_64 + #define sse41_div_float32_128 basic_div_float32_128 + #define sse42_add_float32_64 basic_add_float32_64 + #define sse42_add_float32_128 basic_add_float32_128 + #define sse42_sub_float32_64 basic_sub_float32_64 + #define sse42_sub_float32_128 basic_sub_float32_128 + #define sse42_mul_float32_64 basic_mul_float32_64 + #define sse42_mul_float32_128 basic_mul_float32_128 + #define sse42_div_float32_64 basic_div_float32_64 + #define sse42_div_float32_128 basic_div_float32_128 + #define avx_add_float32_256 basic_add_float32_256 + #define avx_sub_float32_256 basic_sub_float32_256 + #define avx_mul_float32_256 basic_mul_float32_256 + #define avx_div_float32_256 basic_div_float32_256 + #define avx2_add_int32_256 basic_add_int32_256 + #define avx2_sub_int32_256 basic_sub_int32_256 + #define avx2_mul_int32_256 basic_mul_int32_256 + #define avx512_add_float32_512 basic_add_float32_512 + #define avx512_sub_float32_512 basic_sub_float32_512 + #define avx512_mul_float32_512 basic_mul_float32_512 + #define avx512_div_float32_512 basic_div_float32_512 + #define avx512_add_int32_512 basic_add_int32_512 + #define avx512_sub_int32_512 basic_sub_int32_512 + #define avx512_mul_int32_512 basic_mul_int32_512 +#endif // compiler check + +#endif // ENABLE_SIMD + + +void init_cpu_specific() { +#if ENABLE_SIMD + Cpu_Capabilities cap = query_cpu_capabilities(); + + if (cap.avx) { + simd_add_float32_256 = avx_add_float32_256; + simd_sub_float32_256 = avx_sub_float32_256; + simd_mul_float32_256 = avx_mul_float32_256; + simd_div_float32_256 = avx_div_float32_256; + } else { + simd_add_float32_256 = basic_add_float32_256; + simd_sub_float32_256 = basic_sub_float32_256; + simd_mul_float32_256 = basic_mul_float32_256; + simd_div_float32_256 = basic_div_float32_256; + } + + if (cap.avx2) { + simd_add_int32_256 = avx2_add_int32_256; + simd_sub_int32_256 = avx2_sub_int32_256; + simd_mul_int32_256 = avx2_mul_int32_256; + } else 
{ + simd_add_int32_256 = basic_add_int32_256; + simd_sub_int32_256 = basic_sub_int32_256; + simd_mul_int32_256 = basic_mul_int32_256; + } + + if (cap.avx512) { + simd_add_float32_512 = avx512_add_float32_512; + simd_sub_float32_512 = avx512_sub_float32_512; + simd_mul_float32_512 = avx512_mul_float32_512; + simd_div_float32_512 = avx512_div_float32_512; + simd_add_int32_512 = avx512_add_int32_512; + simd_sub_int32_512 = avx512_sub_int32_512; + simd_mul_int32_512 = avx512_mul_int32_512; + } else { + simd_add_float32_512 = basic_add_float32_512; + simd_sub_float32_512 = basic_sub_float32_512; + simd_mul_float32_512 = basic_mul_float32_512; + simd_div_float32_512 = basic_div_float32_512; + simd_add_int32_512 = basic_add_int32_512; + simd_sub_int32_512 = basic_sub_int32_512; + simd_mul_int32_512 = basic_mul_int32_512; + } + + if (cap.sse2) { + simd_add_int32_128 = sse2_add_int32_128; + simd_sub_int32_128 = sse2_sub_int32_128; + simd_mul_int32_128 = sse2_mul_int32_128; + } else { + simd_add_int32_128 = basic_add_int32_128; + simd_sub_int32_128 = basic_sub_int32_128; + simd_mul_int32_128 = basic_mul_int32_128; + } + + if (cap.sse42) { + simd_add_float32_64 = sse42_add_float32_64; + simd_add_float32_128 = sse42_add_float32_128; + simd_sub_float32_64 = sse42_sub_float32_64; + simd_sub_float32_128 = sse42_sub_float32_128; + simd_mul_float32_64 = sse42_mul_float32_64; + simd_mul_float32_128 = sse42_mul_float32_128; + simd_div_float32_64 = sse42_div_float32_64; + simd_div_float32_128 = sse42_div_float32_128; + } else if (cap.sse41) { + simd_add_float32_64 = sse41_add_float32_64; + simd_add_float32_128 = sse41_add_float32_128; + simd_sub_float32_64 = sse41_sub_float32_64; + simd_sub_float32_128 = sse41_sub_float32_128; + simd_mul_float32_64 = sse41_mul_float32_64; + simd_mul_float32_128 = sse41_mul_float32_128; + simd_div_float32_64 = sse41_div_float32_64; + simd_div_float32_128 = sse41_div_float32_128; + } else if (cap.ssse3) { + simd_add_float32_64 = ssse3_add_float32_64; + 
simd_add_float32_128 = ssse3_add_float32_128; + simd_sub_float32_64 = ssse3_sub_float32_64; + simd_sub_float32_128 = ssse3_sub_float32_128; + simd_mul_float32_64 = ssse3_mul_float32_64; + simd_mul_float32_128 = ssse3_mul_float32_128; + simd_div_float32_64 = ssse3_div_float32_64; + simd_div_float32_128 = ssse3_div_float32_128; + } else if (cap.sse3) { + simd_add_float32_64 = sse3_add_float32_64; + simd_add_float32_128 = sse3_add_float32_128; + simd_sub_float32_64 = sse3_sub_float32_64; + simd_sub_float32_128 = sse3_sub_float32_128; + simd_mul_float32_64 = sse3_mul_float32_64; + simd_mul_float32_128 = sse3_mul_float32_128; + simd_div_float32_64 = sse3_div_float32_64; + simd_div_float32_128 = sse3_div_float32_128; + } else if (cap.sse2) { + simd_add_float32_64 = sse2_add_float32_64; + simd_add_float32_128 = sse2_add_float32_128; + simd_sub_float32_64 = sse2_sub_float32_64; + simd_sub_float32_128 = sse2_sub_float32_128; + simd_mul_float32_64 = sse2_mul_float32_64; + simd_mul_float32_128 = sse2_mul_float32_128; + simd_div_float32_64 = sse2_div_float32_64; + simd_div_float32_128 = sse2_div_float32_128; + } else if (cap.sse1) { + simd_add_float32_64 = sse1_add_float32_64; + simd_add_float32_128 = sse1_add_float32_128; + simd_sub_float32_64 = sse1_sub_float32_64; + simd_sub_float32_128 = sse1_sub_float32_128; + simd_mul_float32_64 = sse1_mul_float32_64; + simd_mul_float32_128 = sse1_mul_float32_128; + simd_div_float32_64 = sse1_div_float32_64; + simd_div_float32_128 = sse1_div_float32_128; + } else { + simd_add_float32_64 = basic_add_float32_64; + simd_add_float32_128 = basic_add_float32_128; + simd_sub_float32_64 = basic_sub_float32_64; + simd_sub_float32_128 = basic_sub_float32_128; + simd_mul_float32_64 = basic_mul_float32_64; + simd_mul_float32_128 = basic_mul_float32_128; + simd_div_float32_64 = basic_div_float32_64; + simd_div_float32_128 = basic_div_float32_128; + } +#else // ENABLE_SIMD + + simd_add_float32_64 = basic_add_float32_64; + simd_add_float32_128 = 
basic_add_float32_128; + simd_add_float32_256 = basic_add_float32_256; + simd_add_float32_512 = basic_add_float32_512; + simd_sub_float32_64 = basic_sub_float32_64; + simd_sub_float32_128 = basic_sub_float32_128; + simd_sub_float32_256 = basic_sub_float32_256; + simd_sub_float32_512 = basic_sub_float32_512; + simd_mul_float32_64 = basic_mul_float32_64; + simd_mul_float32_128 = basic_mul_float32_128; + simd_mul_float32_256 = basic_mul_float32_256; + simd_mul_float32_512 = basic_mul_float32_512; + simd_div_float32_64 = basic_div_float32_64; + simd_div_float32_128 = basic_div_float32_128; + simd_div_float32_256 = basic_div_float32_256; + simd_div_float32_512 = basic_div_float32_512; + simd_add_int32_128 = basic_add_int32_128; + simd_add_int32_256 = basic_add_int32_256; + simd_add_int32_512 = basic_add_int32_512; + simd_sub_int32_128 = basic_sub_int32_128; + simd_sub_int32_256 = basic_sub_int32_256; + simd_sub_int32_512 = basic_sub_int32_512; + simd_mul_int32_128 = basic_mul_int32_128; + simd_mul_int32_256 = basic_mul_int32_256; + simd_mul_int32_512 = basic_mul_int32_512; + +#endif // ENABLE_SIMD +} diff --git a/oogabooga/linmath.c b/oogabooga/linmath.c index 967e2ba..51eb229 100644 --- a/oogabooga/linmath.c +++ b/oogabooga/linmath.c @@ -14,94 +14,128 @@ #define to_degrees32 to_degrees typedef union Vector2 { - struct {float x, y;}; + struct {float32 x, y;}; } Vector2; -inline Vector2 v2(float x, float y) { return (Vector2){x, y}; } +inline Vector2 v2(float32 x, float32 y) { return (Vector2){x, y}; } #define v2_expand(v) (v).x, (v).y typedef union Vector3 { - struct {float x, y, z;}; - struct {float r, g, b;}; + struct {float32 x, y, z;}; + struct {float32 r, g, b;}; struct {Vector2 xy;}; - struct {float _x; Vector2 yz;}; + struct {float32 _x; Vector2 yz;}; } Vector3; -inline Vector3 v3(float x, float y, float z) { return (Vector3){x, y, z}; } +inline Vector3 v3(float32 x, float32 y, float32 z) { return (Vector3){x, y, z}; } #define v3_expand(v) (v).x, (v).y, (v).z 
typedef union Vector4 { - struct {float x, y, z, w;}; - struct {float x1, y1, x2, y2;}; - struct {float r, g, b, a;}; - struct {float left, bottom, right, top;}; + struct {float32 x, y, z, w;}; + struct {float32 x1, y1, x2, y2;}; + struct {float32 r, g, b, a;}; + struct {float32 left, bottom, right, top;}; struct {Vector2 xy; Vector2 zw;}; struct {Vector3 xyz;}; - struct {float _x; Vector3 yzw;}; + struct {float32 _x; Vector3 yzw;}; } Vector4; -inline Vector4 v4(float x, float y, float z, float w) { return (Vector4){x, y, z, w}; } +inline Vector4 v4(float32 x, float32 y, float32 z, float32 w) { return (Vector4){x, y, z, w}; } #define v4_expand(v) (v).x, (v).y, (v).z, (v).w -// #Simd #Speed - inline Vector2 v2_add(Vector2 a, Vector2 b) { - return v2(a.x + b.x, a.y + b.y); -} -inline Vector3 v3_add(Vector3 a, Vector3 b) { - return v3(a.x + b.x, a.y + b.y, a.z + b.z); -} -inline Vector4 v4_add(Vector4 a, Vector4 b) { - return v4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + simd_add_float32_64((f32*)&a, (f32*)&b, (f32*)&a); + return a; } inline Vector2 v2_sub(Vector2 a, Vector2 b) { - return v2(a.x - b.x, a.y - b.y); -} -inline Vector3 v3_sub(Vector3 a, Vector3 b) { - return v3(a.x - b.x, a.y - b.y, a.z - b.z); -} -inline Vector4 v4_sub(Vector4 a, Vector4 b) { - return v4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); + simd_sub_float32_64((f32*)&a, (f32*)&b, (f32*)&a); + return a; } inline Vector2 v2_mul(Vector2 a, Vector2 b) { - return v2(a.x * b.x, a.y * b.y); + simd_mul_float32_64((f32*)&a, (f32*)&b, (f32*)&a); + return a; } -inline Vector2 v2_mulf(Vector2 a, float b) { - return v2(a.x * b, a.y * b); -} -inline Vector3 v3_mul(Vector3 a, Vector3 b) { - return v3(a.x * b.x, a.y * b.y, a.z * b.z); -} -inline Vector4 v4_mul(Vector4 a, Vector4 b) { - return v4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +inline Vector2 v2_mulf(Vector2 a, float32 s) { + return v2_mul(a, v2(s, s)); } inline Vector2 v2_div(Vector2 a, Vector2 b) { - return v2(a.x / b.x, a.y / b.y); + 
simd_div_float32_64((f32*)&a, (f32*)&b, (f32*)&a);
+	return a;
+}
+inline Vector2 v2_divf(Vector2 a, float32 s) {
+	return v2_div(a, v2(s, s));
+}
+
+inline Vector3 v3_add(Vector3 a, Vector3 b) {
+	simd_add_float32_64((f32*)&a, (f32*)&b, (f32*)&a);
+	a.z += b.z;
+	return a;
+}
+inline Vector3 v3_sub(Vector3 a, Vector3 b) {
+	simd_sub_float32_64((f32*)&a, (f32*)&b, (f32*)&a);
+	a.z -= b.z;
+	return a;
+}
+inline Vector3 v3_mul(Vector3 a, Vector3 b) {
+	simd_mul_float32_64((f32*)&a, (f32*)&b, (f32*)&a);
+	a.z *= b.z;
+	return a;
+}
+inline Vector3 v3_mulf(Vector3 a, float32 s) {
+	return v3_mul(a, v3(s, s, s));
 }
 inline Vector3 v3_div(Vector3 a, Vector3 b) {
-	return v3(a.x / b.x, a.y / b.y, a.z / b.z);
+	simd_div_float32_64((f32*)&a, (f32*)&b, (f32*)&a);
+	a.z /= b.z;
+	return a;
+}
+inline Vector3 v3_divf(Vector3 a, float32 s) {
+	return v3_div(a, v3(s, s, s));
+}
+
+inline Vector4 v4_add(Vector4 a, Vector4 b) {
+	simd_add_float32_128((f32*)&a, (f32*)&b, (f32*)&a);
+	return a;
+}
+inline Vector4 v4_sub(Vector4 a, Vector4 b) {
+	simd_sub_float32_128((f32*)&a, (f32*)&b, (f32*)&a);
+	return a;
+}
+inline Vector4 v4_mul(Vector4 a, Vector4 b) {
+	simd_mul_float32_128((f32*)&a, (f32*)&b, (f32*)&a);
+	return a;
+}
+inline Vector4 v4_mulf(Vector4 a, float32 s) {
+	return v4_mul(a, v4(s, s, s, s));
 }
 inline Vector4 v4_div(Vector4 a, Vector4 b) {
-	return v4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+	simd_div_float32_128((f32*)&a, (f32*)&b, (f32*)&a);
+	return a;
 }
+inline Vector4 v4_divf(Vector4 a, float32 s) {
+	return v4_div(a, v4(s, s, s, s));
+}
+
 inline Vector2 v2_normalize(Vector2 a) {
-	float length = sqrt(a.x * a.x + a.y * a.y);
+	float32 length = sqrt(a.x * a.x + a.y * a.y);
 	if (length == 0) {
 		return (Vector2){0, 0};
 	}
-	return (Vector2){a.x / length, a.y / length};
+	return v2_divf(a, length);
 }
-Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float rotation_radians) {
-	float s = sin(rotation_radians);
-	float c = cos(rotation_radians);
+Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float32 rotation_radians) {
+	float32 s = sin(rotation_radians);
+	float32 c = cos(rotation_radians);
 	point.x -= pivot.x;
 	point.y -= pivot.y;
-	float x_new = point.x * c - point.y * s;
-	float y_new = point.x * s + point.y * c;
+	float32 x_new = point.x * c - point.y * s;
+	float32 y_new = point.x * s + point.y * c;
 	point.x = x_new + pivot.x;
 	point.y = y_new + pivot.y;
 	return point;
 }
@@ -112,10 +146,10 @@ Vector2 v2_rotate_point_around_pivot(Vector2 point, Vector2 pivot, float rotatio
 typedef struct Matrix4 {
-	union {float m[4][4]; float data[16]; };
+	union {float32 m[4][4]; float32 data[16]; };
 } Matrix4;
-Matrix4 m4_scalar(float scalar) {
+Matrix4 m4_scalar(float32 scalar) {
 	Matrix4 m;
 	for (int i = 0; i < 16; i++) {
 		m.data[i] = 0.0f;
@@ -136,11 +170,11 @@ Matrix4 m4_make_translation(Vector3 translation) {
 	return m;
 }
-Matrix4 m4_make_rotation(Vector3 axis, float radians) {
+Matrix4 m4_make_rotation(Vector3 axis, float32 radians) {
 	Matrix4 m = m4_scalar(1.0);
-	float c = cosf(radians);
-	float s = sinf(radians);
-	float t = 1.0f - c;
+	float32 c = cosf(radians);
+	float32 s = sinf(radians);
+	float32 t = 1.0f - c;
 	m.m[0][0] = c + axis.x * axis.x * t;
 	m.m[0][1] = axis.x * axis.y * t + axis.z * s;
@@ -158,7 +192,7 @@ Matrix4 m4_make_rotation(Vector3 axis, float radians) {
 	return m;
 }
-inline Matrix4 m4_make_rotation_z(float radians) {
+inline Matrix4 m4_make_rotation_z(float32 radians) {
 	return m4_make_rotation(v3(0, 0, 1), radians);
 }
@@ -189,11 +223,11 @@ inline Matrix4 m4_translate(Matrix4 m, Vector3 translation) {
 	return m4_multiply(m, translation_matrix);
 }
-inline Matrix4 m4_rotate(Matrix4 m, Vector3 axis, float radians) {
+inline Matrix4 m4_rotate(Matrix4 m, Vector3 axis, float32 radians) {
 	Matrix4 rotation_matrix = m4_make_rotation(axis, radians);
 	return m4_multiply(m, rotation_matrix);
 }
-inline Matrix4
m4_rotate_z(Matrix4 m, float radians) { +inline Matrix4 m4_rotate_z(Matrix4 m, float32 radians) { Matrix4 rotation_matrix = m4_make_rotation(v3(0, 0, 1), radians); return m4_multiply(m, rotation_matrix); } @@ -205,7 +239,7 @@ inline Matrix4 m4_scale(Matrix4 m, Vector3 scale) { // _near & _far because microsoft... -Matrix4 m4_make_orthographic_projection(float left, float right, float bottom, float top, float _near, float _far) { +Matrix4 m4_make_orthographic_projection(float32 left, float32 right, float32 bottom, float32 top, float32 _near, float32 _far) { Matrix4 m = m4_scalar(1.0f); m.m[0][0] = 2.0f / (right - left); m.m[1][1] = 2.0f / (top - bottom); @@ -227,7 +261,7 @@ Vector4 m4_transform(Matrix4 m, Vector4 v) { } Matrix4 m4_inverse(Matrix4 m) { Matrix4 inv; - float det; + float32 det; inv.m[0][0] = m.m[1][1] * m.m[2][2] * m.m[3][3] - m.m[1][1] * m.m[2][3] * m.m[3][2] - diff --git a/oogabooga/oogabooga.c b/oogabooga/oogabooga.c index 1c37fd4..37142ce 100644 --- a/oogabooga/oogabooga.c +++ b/oogabooga/oogabooga.c @@ -22,6 +22,11 @@ #define DO_ZERO_INITIALIZATION 1 #endif +#ifndef ENABLE_SIMD + #define ENABLE_SIMD 1 +#endif + + #define WINDOWS 0 #define LINUX 1 #define MACOS 2 @@ -68,7 +73,7 @@ void lodepng_free(void* ptr) { ///// - +#include "cpu.c" #ifdef _WIN32 #include @@ -211,10 +216,21 @@ void _profiler_report_time_cycles(string name, u64 count, u64 start) { void oogabooga_init(u64 program_memory_size) { context.logger = default_logger; temp = get_initialization_allocator(); + init_cpu_specific(); + Cpu_Capabilities features = query_cpu_capabilities(); os_init(program_memory_size); heap_init(); temporary_storage_init(); gfx_init(); + log_verbose("CPU has sse1: %cs", features.sse1 ? "true" : "false"); + log_verbose("CPU has sse2: %cs", features.sse2 ? "true" : "false"); + log_verbose("CPU has sse3: %cs", features.sse3 ? "true" : "false"); + log_verbose("CPU has ssse3: %cs", features.ssse3 ? 
"true" : "false"); + log_verbose("CPU has sse41: %cs", features.sse41 ? "true" : "false"); + log_verbose("CPU has sse42: %cs", features.sse42 ? "true" : "false"); + log_verbose("CPU has avx: %cs", features.avx ? "true" : "false"); + log_verbose("CPU has avx2: %cs", features.avx2 ? "true" : "false"); + log_verbose("CPU has avx512: %cs", features.avx512 ? "true" : "false"); } #ifndef INITIAL_PROGRAM_MEMORY_SIZE @@ -229,7 +245,6 @@ int ENTRY_PROC(int argc, char **argv); int main(int argc, char **argv) { - printf("Ooga booga program started\n"); oogabooga_init(INITIAL_PROGRAM_MEMORY_SIZE); diff --git a/oogabooga/tests.c b/oogabooga/tests.c index a2af6b5..38964a5 100644 --- a/oogabooga/tests.c +++ b/oogabooga/tests.c @@ -539,8 +539,202 @@ void test_file_io() { delete_ok = os_file_delete("integers"); assert(delete_ok, "Failed: could not delete integers"); } +void test_simd() { + u64 start = os_get_current_cycle_count(); + + // Setup test data + float32 a_f32[32], b_f32[32], result_f32[32]; + s32 a_i32[16], b_i32[16], result_i32[16]; + + for (int i = 0; i < 16; ++i) { + a_f32[i] = i * 1.0f; + b_f32[i] = (i + 1) * 2.0f; + a_i32[i] = i; + b_i32[i] = i + 1; + } + + // Test function pointers setup + query_cpu_capabilities(); + + // Test float32 add + simd_add_float32_64(a_f32, b_f32, result_f32); + assert(result_f32[0] == a_f32[0]+b_f32[0], "SIMD add float32 64 failed"); + + simd_add_float32_128(a_f32, b_f32, result_f32); + for (int i = 0; i < 4; ++i) { + assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 128 failed"); + } + + simd_add_float32_256(a_f32, b_f32, result_f32); + for (int i = 0; i < 8; ++i) { + assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 256 failed"); + } + + simd_add_float32_512(a_f32, b_f32, result_f32); + for (int i = 0; i < 16; ++i) { + assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 512 failed"); + } + + // Test float32 subtract + simd_sub_float32_64(a_f32, b_f32, result_f32); + assert(result_f32[0] == 
a_f32[0]-b_f32[0], "SIMD sub float32 64 failed"); + + simd_sub_float32_128(a_f32, b_f32, result_f32); + for (int i = 0; i < 4; ++i) { + assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 128 failed"); + } + + simd_sub_float32_256(a_f32, b_f32, result_f32); + for (int i = 0; i < 8; ++i) { + assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 256 failed"); + } + + simd_sub_float32_512(a_f32, b_f32, result_f32); + for (int i = 0; i < 16; ++i) { + assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 512 failed"); + } + + // Test float32 multiply + simd_mul_float32_64(a_f32, b_f32, result_f32); + assert(result_f32[0] == a_f32[0]*b_f32[0], "SIMD mul float32 64 failed"); + + simd_mul_float32_128(a_f32, b_f32, result_f32); + for (int i = 0; i < 4; ++i) { + assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 128 failed"); + } + + simd_mul_float32_256(a_f32, b_f32, result_f32); + for (int i = 0; i < 8; ++i) { + assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 256 failed"); + } + + simd_mul_float32_512(a_f32, b_f32, result_f32); + for (int i = 0; i < 16; ++i) { + assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 512 failed"); + } + + // Test float32 divide + simd_div_float32_64(a_f32, b_f32, result_f32); + assert(result_f32[0] == a_f32[0]/b_f32[0], "SIMD div float32 64 failed"); + + simd_div_float32_128(a_f32, b_f32, result_f32); + for (int i = 0; i < 4; ++i) { + assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 128 failed"); + } + + simd_div_float32_256(a_f32, b_f32, result_f32); + for (int i = 0; i < 8; ++i) { + assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 256 failed"); + } + + simd_div_float32_512(a_f32, b_f32, result_f32); + for (int i = 0; i < 16; ++i) { + assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 512 failed"); + } + + // Test int32 add + simd_add_int32_128(a_i32, b_i32, result_i32); + for (int i = 0; i < 4; ++i) { + assert(result_i32[i] == a_i32[i] 
+ b_i32[i], "SIMD add int32 128 failed"); + } + + simd_add_int32_256(a_i32, b_i32, result_i32); + for (int i = 0; i < 8; ++i) { + assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 256 failed"); + } + + simd_add_int32_512(a_i32, b_i32, result_i32); + for (int i = 0; i < 16; ++i) { + assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 512 failed"); + } + + // Test int32 subtract + simd_sub_int32_128(a_i32, b_i32, result_i32); + for (int i = 0; i < 4; ++i) { + assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 128 failed"); + } + + simd_sub_int32_256(a_i32, b_i32, result_i32); + for (int i = 0; i < 8; ++i) { + assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 256 failed"); + } + + simd_sub_int32_512(a_i32, b_i32, result_i32); + for (int i = 0; i < 16; ++i) { + assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 512 failed"); + } + + // Test int32 multiply + simd_mul_int32_128(a_i32, b_i32, result_i32); + for (int i = 0; i < 4; ++i) { + assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 128 failed"); + } + + simd_mul_int32_256(a_i32, b_i32, result_i32); + for (int i = 0; i < 8; ++i) { + assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 256 failed"); + } + + simd_mul_int32_512(a_i32, b_i32, result_i32); + for (int i = 0; i < 16; ++i) { + assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 512 failed"); + } + + // Stress test with random values + for (int i = 0; i < 16; ++i) { + a_f32[i] = (float32)get_random(); + b_f32[i] = (float32)get_random(); + a_i32[i] = (s32)get_random(); + b_i32[i] = (s32)get_random(); + } + + simd_add_float32_512(a_f32, b_f32, result_f32); + for (int i = 0; i < 16; ++i) { + assert(result_f32[i] == a_f32[i] + b_f32[i], "SIMD add float32 stress test failed"); + } + + simd_sub_float32_512(a_f32, b_f32, result_f32); + for (int i = 0; i < 16; ++i) { + assert(result_f32[i] == a_f32[i] - b_f32[i], "SIMD sub float32 stress test failed"); + } + + simd_mul_float32_512(a_f32, 
b_f32, result_f32); + for (int i = 0; i < 16; ++i) { + assert(result_f32[i] == a_f32[i] * b_f32[i], "SIMD mul float32 stress test failed"); + } + + simd_div_float32_512(a_f32, b_f32, result_f32); + for (int i = 0; i < 16; ++i) { + assert(result_f32[i] == a_f32[i] / b_f32[i], "SIMD div float32 stress test failed"); + } + + simd_add_int32_512(a_i32, b_i32, result_i32); + for (int i = 0; i < 16; ++i) { + assert(result_i32[i] == a_i32[i] + b_i32[i], "SIMD add int32 stress test failed"); + } + + simd_sub_int32_512(a_i32, b_i32, result_i32); + for (int i = 0; i < 16; ++i) { + assert(result_i32[i] == a_i32[i] - b_i32[i], "SIMD sub int32 stress test failed"); + } + + simd_mul_int32_512(a_i32, b_i32, result_i32); + for (int i = 0; i < 16; ++i) { + assert(result_i32[i] == a_i32[i] * b_i32[i], "SIMD mul int32 stress test failed"); + } + + u64 end = os_get_current_cycle_count(); + + u64 cycles = end-start; + + print(" simd took %llu cycles ", cycles); +} void oogabooga_run_tests() { + print("Testing simd... "); + test_simd(); + print("OK!\n"); + print("Testing allocator... "); test_allocator(true); print("OK!\n"); @@ -553,6 +747,7 @@ void oogabooga_run_tests() { test_strings(); print("OK!\n"); + print("Thread bombing allocator... "); Thread* threads[100]; for (int i = 0; i < 100; i++) { @@ -567,4 +762,5 @@ void oogabooga_run_tests() { print("Testing file IO... "); test_file_io(); print("OK!\n"); + } \ No newline at end of file