// LSD radix sort, base 256 — a niche sort used for Z-sorting quads.
//
// Sorts `item_count` items of `item_size` bytes in `collection`, keyed on a
// u64 stored at byte offset `sort_value_offset_in_item` inside each item.
// Only the low `number_of_bits` of the key participate in ordering, and the
// key is treated as a signed (two's-complement) value of that bit width.
// `help_buffer` must be at least item_count * item_size bytes of scratch —
// that doubled memory is the cost of the (very promising) speed win.
//
// NOTE(review): a full 8-byte u64 is read at the key offset, so every item
// needs 8 readable bytes there even when fewer bits are used — confirm at
// call sites.
//
// At 21 bits this sorts a completely randomized collection of 100k integers
// in ~8M cycles (2.5-2.6 ms on an i5-11300H).
void radix_sort(void *collection, void *help_buffer, u64 item_count, u64 item_size, u64 sort_value_offset_in_item, u64 number_of_bits)
{
    // enum constants (rather than `local_persist const int`) so the count
    // arrays below are true fixed-size arrays instead of VLAs.
    enum { RADIX = 256, BITS_PER_PASS = 8 };
    const int PASS_COUNT = (int)((number_of_bits + BITS_PER_PASS - 1) / BITS_PER_PASS);
    // Bias that flips the sign bit so signed keys order correctly when
    // compared as unsigned. Guarded: with number_of_bits == 0 the original
    // expression shifts by >= 64, which is undefined behavior, even though
    // the pass loop below would never run.
    const u64 HALF_RANGE_OF_VALUE_BITS = (number_of_bits > 0) ? (1ULL << (number_of_bits - 1)) : 0;

    u64 count[RADIX];
    u64 prefix_sum[RADIX];

    for (u32 pass = 0; pass < (u32)PASS_COUNT; ++pass)
    {
        u32 shift = pass * BITS_PER_PASS;

        // Histogram: how many keys fall in each bucket for this digit.
        memset(count, 0, sizeof(count));
        for (u64 i = 0; i < item_count; ++i)
        {
            u8 *item = (u8*)collection + i * item_size;
            u64 sort_value;
            // memcpy instead of *(u64*): the key may be misaligned inside the
            // item, and the pointer cast would also violate strict aliasing.
            memcpy(&sort_value, item + sort_value_offset_in_item, sizeof(sort_value));
            sort_value += HALF_RANGE_OF_VALUE_BITS; // treat the value as signed
            u32 digit = (sort_value >> shift) & (RADIX - 1);
            ++count[digit];
        }

        // Exclusive prefix sum: first output slot for each bucket.
        prefix_sum[0] = 0;
        for (u32 i = 1; i < RADIX; ++i)
        {
            prefix_sum[i] = prefix_sum[i - 1] + count[i - 1];
        }

        // Stable scatter into the help buffer, then copy the pass back.
        for (u64 i = 0; i < item_count; ++i)
        {
            u8 *item = (u8*)collection + i * item_size;
            u64 sort_value;
            memcpy(&sort_value, item + sort_value_offset_in_item, sizeof(sort_value));
            sort_value += HALF_RANGE_OF_VALUE_BITS; // treat the value as signed
            u32 digit = (sort_value >> shift) & (RADIX - 1);
            memcpy((u8*)help_buffer + prefix_sum[digit] * item_size, item, item_size);
            ++prefix_sum[digit];
        }
        memcpy(collection, help_buffer, item_count * item_size);
    }
}

// Bottom-up (iterative) merge sort. Stable, O(n log n).
// `help_buffer` is scratch of the same size as `collection`
// (item_count * item_size bytes). `compare` follows qsort semantics:
// negative / zero / positive for a < b / a == b / a > b.
void merge_sort(void *collection, void *help_buffer, u64 item_count, u64 item_size, int (*compare)(const void *, const void *))
{
    u8 *items = (u8 *)collection;
    u8 *buffer = (u8 *)help_buffer;

    // Merge sorted runs of width 1, 2, 4, ... until one run covers the array.
    for (u64 width = 1; width < item_count; width *= 2)
    {
        for (u64 i = 0; i < item_count; i += 2 * width)
        {
            u64 left = i;
            u64 right = (i + width < item_count) ? (i + width) : item_count;
            u64 end = (i + 2 * width < item_count) ? (i + 2 * width) : item_count;

            u64 left_index = left;
            u64 right_index = right;
            u64 k = left;

            // Merge [left, right) and [right, end) into the buffer.
            // `<= 0` keeps equal elements in original order (stability).
            while (left_index < right && right_index < end)
            {
                if (compare(items + left_index * item_size, items + right_index * item_size) <= 0)
                {
                    memcpy(buffer + k * item_size, items + left_index * item_size, item_size);
                    left_index++;
                }
                else
                {
                    memcpy(buffer + k * item_size, items + right_index * item_size, item_size);
                    right_index++;
                }
                k++;
            }
            // Drain whichever side still has elements.
            while (left_index < right)
            {
                memcpy(buffer + k * item_size, items + left_index * item_size, item_size);
                left_index++;
                k++;
            }
            while (right_index < end)
            {
                memcpy(buffer + k * item_size, items + right_index * item_size, item_size);
                right_index++;
                k++;
            }
            // Copy the merged run back in place.
            for (u64 j = left; j < end; j++)
            {
                memcpy(items + j * item_size, buffer + j * item_size, item_size);
            }
        }
    }
}

// True when the first `count` bytes of a and b are identical.
// NOTE(review): plain `inline` (non-static) in C99 requires an external
// definition in some TU if a call is not inlined — fine in a unity build,
// otherwise consider `static inline`. Left unchanged to preserve linkage.
inline bool bytes_match(void *a, void *b, u64 count)
{
    return memcmp(a, b, count) == 0;
}

// Swap two lvalues of the given type.
// do/while(0) makes the macro a single statement (safe in an unbraced
// if/else); the underscored temp avoids colliding with a caller's `t`.
#define swap(a, b, type) do { type swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)