// Listing metadata: 134 lines, 4.2 KiB, C.

// UTF-16 surrogate-pair constants.
// A code point above U+FFFF is stored as a high surrogate followed by a low
// surrogate; each of the two units carries 10 payload bits.
#define UTF16_SURROGATE_HIGH_START 0xD800  // first high (leading) surrogate
#define UTF16_SURROGATE_HIGH_END   0xDBFF  // last high (leading) surrogate
#define UTF16_SURROGATE_LOW_START  0xDC00  // first low (trailing) surrogate
#define UTF16_SURROGATE_LOW_END    0xDFFF  // last low (trailing) surrogate
#define UTF16_SURROGATE_OFFSET     0x10000 // added after combining the two 10-bit halves
#define UTF16_SURROGATE_MASK       0x3FF   // low 10 bits of a code unit

// Returns how many utf16 units were converted
|
|
int utf16_to_utf32(const u16 *utf16, u64 length, u32 *utf32) {
|
|
if (length == 0 || utf16 == NULL || utf32 == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
u16 first = utf16[0];
|
|
|
|
if (first >= UTF16_SURROGATE_HIGH_START && first <= UTF16_SURROGATE_HIGH_END) {
|
|
if (length < 2) {
|
|
return -1;
|
|
}
|
|
|
|
u16 second = utf16[1];
|
|
if (second >= UTF16_SURROGATE_LOW_START && second <= UTF16_SURROGATE_LOW_END) {
|
|
*utf32 = ((first - UTF16_SURROGATE_HIGH_START) << 10)
|
|
+ (second - UTF16_SURROGATE_LOW_START)
|
|
+ UTF16_SURROGATE_OFFSET;
|
|
return 2;
|
|
} else {
|
|
return -1;
|
|
}
|
|
} else if (first >= UTF16_SURROGATE_LOW_START && first <= UTF16_SURROGATE_LOW_END) {
|
|
return -1;
|
|
} else {
|
|
*utf32 = first;
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
// Yoinked from jai Unicode.jai

#define UNI_REPLACEMENT_CHAR 0x0000FFFD // U+FFFD, returned in place of input that cannot be decoded
#define UNI_MAX_UTF32        0x7FFFFFFF // utf8_to_utf32() replaces decoded values above this
#define UNI_MAX_UTF16        0x0010FFFF // U+10FFFF; strict decoding rejects values above this
#define SURROGATES_START     0xD800     // first value of the surrogate range rejected by strict decoding
#define SURROGATES_END       0xDFFF     // last value of the surrogate range rejected by strict decoding

typedef struct {
|
|
u32 utf32;
|
|
s64 continuation_bytes;
|
|
bool reached_end;
|
|
bool error;
|
|
} Utf8_To_Utf32_Result;
|
|
|
|
// Number of bytes that follow a given first byte of a UTF-8 sequence, indexed
// by that first byte (256 entries):
//   0x00..0xBF -> 0   (0x80..0xBF are continuation bytes and also map to 0)
//   0xC0..0xDF -> 1
//   0xE0..0xEF -> 2
//   0xF0..0xF7 -> 3
//   0xF8..0xFB -> 4
//   0xFC..0xFF -> 5
const u8 trailing_bytes_for_utf8[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

// Mask selecting the payload bits of the first byte of a UTF-8 sequence,
// indexed by the number of trailing bytes (0..5) that byte announces.
// (The spelling "inital" is kept because the name is referenced as-is.)
const u8 utf8_inital_byte_mask[] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };

// Decodes the UTF-8 sequence that starts at s[0] into one UTF-32 code point.
//
// s             - input bytes
// source_length - number of readable bytes at s
// strict        - when true, malformed input is rejected: a continuation byte
//                 in first position, a missing continuation byte, an overlong
//                 encoding, a surrogate (0xD800..0xDFFF), a value above
//                 0x10FFFF, or a sequence longer than 4 bytes. When false the
//                 bytes are combined without validation.
//
// Returns a Utf8_To_Utf32_Result:
//   success - { code point, bytes in the sequence, false, false }
//   failure - { UNI_REPLACEMENT_CHAR, bytes to skip, true, true }
// The byte count on failure is 0 only when there was no input at all; every
// other failure reports at least 1 so a caller that advances by it makes
// progress.
Utf8_To_Utf32_Result utf8_to_utf32(u8 *s, s64 source_length, bool strict) {
    // Nothing to decode. Checked first so s[0] is never read from an empty
    // or missing buffer.
    if (s == NULL || source_length <= 0) {
        return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, 0, true, true};
    }

    u8 lead = s[0];
    s64 continuation_bytes = trailing_bytes_for_utf8[lead];

    // The sequence is cut off by the end of the input: report all remaining
    // bytes as consumed.
    if (continuation_bytes + 1 > source_length) {
        return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, source_length, true, true};
    }

    // 0x80..0xBF are continuation bytes. The table gives them 0 trailing
    // bytes, so without this check they would decode as a 7-bit character.
    if (strict && (lead & 0xC0) == 0x80) {
        return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, 1, true, true};
    }

    u32 ch = lead & utf8_inital_byte_mask[continuation_bytes];

    for (s64 i = 1; i <= continuation_bytes; i++) { // Does nothing if it is 0.
        if (strict && (s[i] & 0xC0) != 0x80) {
            // s[i] is not a continuation byte. Report the i bytes before it
            // as consumed so decoding can resume at s[i].
            return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, i, true, true};
        }
        ch = (ch << 6) | (s[i] & 0x3F);
    }

    if (strict) {
        // Overlong: the value would have fit in a shorter sequence.
        bool overlong = (continuation_bytes == 1 && ch < 0x00000080) ||
                        (continuation_bytes == 2 && ch < 0x00000800) ||
                        (continuation_bytes == 3 && ch < 0x00010000);

        if (ch > UNI_MAX_UTF16 ||
            (SURROGATES_START <= ch && ch <= SURROGATES_END) ||
            overlong ||
            continuation_bytes > 3) {
            return (Utf8_To_Utf32_Result){UNI_REPLACEMENT_CHAR, continuation_bytes+1, true, true};
        }
    }

    if (ch > UNI_MAX_UTF32) {
        ch = UNI_REPLACEMENT_CHAR;
    }

    return (Utf8_To_Utf32_Result){ ch, continuation_bytes+1, false, false };
}

// Returns 0 on fail
|
|
u32 next_utf8(string *s) {
|
|
Utf8_To_Utf32_Result result = utf8_to_utf32(s->data, s->count, false);
|
|
|
|
s->data += result.continuation_bytes;
|
|
s->count -= result.continuation_bytes;
|
|
assert(s->count >= 0);
|
|
|
|
if (result.error) return 0;
|
|
|
|
return result.utf32;
|
|
}
|
|
|
|
// Converts a code-point index into a byte offset within str.
//
// Walks str from its start with next_utf8(), counting code points until
// `index` of them have been passed. The walk stops early when str runs out or
// when next_utf8() returns 0 (decode failure or an encoded U+0000); the offset
// returned then is where the last successfully counted code point ended.
u64 utf8_index_to_byte_index(string str, u64 index) {
    u64 byte_index = 0;
    u64 utf8_index = 0;

    while (utf8_index < index && str.count != 0) {
        // Remember where this code point starts so its byte length can be
        // measured after next_utf8() has advanced str.
        u8 *start = (u8*)str.data;

        u32 codepoint = next_utf8(&str);
        if (!codepoint) break;

        u64 byte_diff = ((u8*)str.data) - start;
        assert(byte_diff != 0);
        byte_index += byte_diff;
        utf8_index += 1;
    }

    return byte_index;
}

// Returns a view of `count` code points of str starting at code-point `index`.
//
// Both ends are translated to byte offsets with utf8_index_to_byte_index() and
// the byte range is handed to string_view(). Ends that lie past the walkable
// part of str are clamped by that translation.
string utf8_slice(string str, u64 index, u64 count) {
    // Saturate index + count instead of letting it wrap: a wrapped end index
    // would land before the start and make byte_count underflow.
    u64 max_index = ~(u64)0;
    u64 end_index = (count > max_index - index) ? max_index : index + count;

    u64 byte_index = utf8_index_to_byte_index(str, index);
    u64 byte_end_index = utf8_index_to_byte_index(str, end_index);
    u64 byte_count = byte_end_index - byte_index;

    return string_view(str, byte_index, byte_count);
}